Supplemental code used for generating results for the manuscript Epigenetic blueprint identifies poor outcome and hypomethylating agent-responsive T-ALL subgroup. Following code is written in R. Please refer to the end of the document for sessionInfo regarding package versions.

Load libraries

#For data import/wrangling
library(data.table)
library(readxl)
#For Diff. Methylation and Expression analysis
library(limma)
library(DESeq2)
library(sva)
library(BiocParallel)
register(MulticoreParam(8))
#For enrichment analysis
library(goseq)
library(maftools)
library(GenomicRanges)
library(LOLA)
#For ontogeny analysis
library(ape)
#For Clustering
library(NMF)
library(fossil) #For rand.index function
library(umap) 
library(parallel)
#For survival
library(survival)
#For visualization
library(pheatmap)
suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(ComplexHeatmap))
library(RColorBrewer)
library(ggplot2)
library(peakSeason) #https://github.com/PoisonAlien/peakseason
#Modified pheatmap which avoids repetitive legends
source("code/pheatmap2.R")

#Accessory helpful functions
source("code/helper_functions.R")
#Wrapper function for running NMF
source("code/extract_componenets.R")

Load data

Raw IDAT files and processed beta value matrices are available on GEO under the accession GSE147667. Below analysis begins with the processed beta values.

#Processed Tumor and normal beta values - stored as ExpressionSet objects
tumor_data = readRDS(file = "data/t-alls_eset.RDs")
print(tumor_data)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 779812 features, 143 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: sample_231 sample_234 ... sample_RAD_MI (143 total)
##   varLabels: Sample_Name Age ... HOXA_nd (52 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: cg26928153 cg16269199 ... cg02995750 (779812 total)
##   fvarLabels: rn Chr ... Relation_to_Island (12 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
normal_data = readRDS(file = "data/thymic_normals_eset.RDs")
print(normal_data)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 779812 features, 12 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: sam_CD34_1 sam_CD34_2 ... sam_SPo8_2 (12 total)
##   varLabels: Sample_Name Age ... HOXA_nd (52 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: cg26928153 cg16269199 ... cg02995750 (779812 total)
##   fvarLabels: rn Chr ... Relation_to_Island (12 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
#Uniform color-codes for visualization.
color_codes = readRDS(file = "data/color_codes.RDs")

Dimension reduction and clustering of T-ALL samples

Remove probes on CNV regions

Remove probes located on CN altered regions such as amplicons or deletions. Such probes can lead to false methylation values thereby affecting the analysis. Each of the EPIC array was analyzed with conumee package and segmentation files were generated. Segmentation files from all the samples are further processed with GISTIC2.0 to identify genomic regions that are recurrently and significantly amplified or deleted across the cohort.

gistic_res = maftools::readGistic(gisticAllLesionsFile = "data/gistic/all_lesions.conf_90.txt", 
                     gisticAmpGenesFile = "data/gistic/amp_genes.conf_90.txt",
                      gisticDelGenesFile = "data/gistic/del_genes.conf_90.txt", 
                     gisticScoresFile = "data/gistic//scores.gistic")
## -Processing Gistic files..
## --Processing amp_genes.conf_90.txt
## --Processing del_genes.conf_90.txt
## --Processing scores.gistic
## --Summarizing by samples
#CNV region limits
cnv_regions = maftools::getCytobandSummary(x = gistic_res)[,.(Wide_Peak_Limits, Variant_Classification)]
cnv_regions[, chr := data.table::tstrsplit(x = Wide_Peak_Limits, split = ":")[[1]]]
cnv_regions[, region := data.table::tstrsplit(x = Wide_Peak_Limits, split = ":")[[2]]]    
cnv_regions[, start := data.table::tstrsplit(x = cnv_regions$region, split = "-")[[1]]]
cnv_regions[, end := data.table::tstrsplit(x = cnv_regions$region, split = "-")[[2]]]
cnv_regions[, region := NULL]
cnv_regions[, start := as.numeric(start)]
cnv_regions[, end := as.numeric(end)]
cnv_regions = cnv_regions[,.(chr, start, end, Variant_Classification, Wide_Peak_Limits)]
data.table::setkey(x = cnv_regions, chr, start, end)

#EPIC probe annotations
epic_anno = data.table::fread(input = "data/epic_genomic_annotations.tsv.gz")
epic_anno = epic_anno[,.(Chr, Start, End, Strand, rn)]
colnames(epic_anno)[1:3] = c('chr', 'start', 'end')
epic_anno[, start := as.numeric(start)]
epic_anno[, end := as.numeric(end)]
data.table::setkey(x = epic_anno, chr, start, end)

#Probes within CN altered probes
cn_probes = data.table::foverlaps(x = cnv_regions, y = epic_anno)
cn_probes = cn_probes[!duplicated(rn)]
message("Number of porbes on CN altered regions: ", nrow(cn_probes))
## Number of porbes on CN altered regions: 11363
print(cn_probes[,.N,Variant_Classification])
##    Variant_Classification    N
## 1:                    Del 8020
## 2:                    Amp 3343
cn_altered_probes = unique(cn_probes$rn)
#Remove copy-number altered probes
tumor_data_clean = exprs(tumor_data[!rownames(tumor_data) %in% cn_altered_probes, ])

#Estimate SD on tumor samples alone
tumor_data_clean = order_by_sds(mat = tumor_data_clean, keep_sd = FALSE)

#Using top 5% most variable data
n_rows = as.integer(nrow(tumor_data_clean) * 0.05) 

clust_dat = tumor_data_clean[1:n_rows,]

print(dim(clust_dat))
## [1] 38583   143

Non-negative Factroization

Here we use NMF to estimate optimal number of components from a mixture of cohort.

  • Run NMF for a range of possible components (e.g, N = 2 to 10) on variable number of probes (e.g, 5K, 10K, etc)
  • Measure a goodness of fit (Cophenetic correlation) for each component
  • Choose optimal number of components based on the maximum Cophenetic correlation. See Brunet et al. for details.
#Range of probes to try
trial_nrows = c(seq(5000, 35000, 5000), nrow(clust_dat))
print(trial_nrows)
## [1]  5000 10000 15000 20000 25000 30000 35000 38583
#estimate_components function sourced from `extract_componenets.R`
nmf_trial_nrows = lapply(trial_nrows, function(nrows) {
  nmf_run = estimate_componenets(
    X = as.matrix(x = clust_dat[1:nrows, ]),
    n_run = 5,
    n_threads = 8,
    n_max = 10
  )
  nmf_run
})
names(nmf_trial_nrows) = paste0("n_", trial_nrows)

Optimal number of components

Cophenetic correlation is a measure of goodness of fit.

row_cols = RColorBrewer::brewer.pal(n = 9, name = "YlOrRd")[2:9]
par(mar = c(4, 5, 1, 1), cex = 0.8)
plot(nmf_trial_nrows[[1]]$rank, nmf_trial_nrows[[1]]$cophenetic , axes = FALSE, pch = 16, col = row_cols[1], xlab = NA, ylab = NA, ylim = c(0.94, 1), xlim = c(2, 10))
abline(h = seq(0.94, 1, 0.01), v = 2:10, col = grDevices::adjustcolor(col = 'gray70', alpha.f = 0.7), lty = 2)
lines(nmf_trial_nrows[[1]]$rank, nmf_trial_nrows[[1]]$cophenetic, lwd = 1.2, col = row_cols[1])
points(nmf_trial_nrows[[1]]$rank, nmf_trial_nrows[[1]]$cophenetic, pch = 19, col = row_cols[1])

for(i in 2:length(nmf_trial_nrows)){
  lines(x = nmf_trial_nrows[[i]]$rank, nmf_trial_nrows[[i]]$cophenetic, col = row_cols[i])
  points(nmf_trial_nrows[[i]]$rank, nmf_trial_nrows[[i]]$cophenetic, pch = 19, col = row_cols[i])
}

axis(side = 1, at = 2:10, labels = 2:10, font = 1)
axis(side = 2, at = seq(0.94, 1, 0.01), font = 1, las = 2, cex = 1.4)

mtext(text = "Number of components", side = 1, line = 2.5, adj = 0, cex = 0.8)
mtext(text = "Cophenetic-correlation metric", side = 2, line = 3.5, adj = 0, cex = 0.8)

legend(x = 5, y = 0.97, legend = trial_nrows, col = row_cols, pch = 19, cex = 0.6)

Based on above elbow plot of Cophenetic-correlation metric, optimal number of components seem to be 5 - since after 5 there is a marginal change, followed by gradual decline.

Cluster stability

Re-run nmf at n=5 for above range of probes to access cluster stability

nmf_trial_nrows_k5 = lapply(trial_nrows, function(nrows) {
  nmf_run = extract_componenets(
    X = as.matrix(x = clust_dat[1:nrows, ]),
    n_run = 5,
    n_threads = 8, 
    n = 5
  )
  nmf_run
})
#Perform k-mean clustering on contributions
nmf_k5_clusters = lapply(nmf_trial_nrows_k5, function(x){
  set.seed(seed = 1024)
  x_clust = kmeans(x = t(x$contributions), centers = 5)
  x_clust$cluster
})

#Get cluster assignments from k-mean results
nmf_k5_clusters = data.frame(
  row.names = colnames(nmf_trial_nrows_k5[[1]]$contributions),
  cluster = data.frame(nmf_k5_clusters),
  stringsAsFactors = FALSE
)
colnames(nmf_k5_clusters) = paste0("nprobes_", trial_nrows)

Rand-Index to acess cluster integrity

Check how stable the clusters are by estimating Rand-index between all k-mean clustering results

#Pairwise rand-index estimation
interactions = sapply(1:ncol(nmf_k5_clusters), function(i) {
  sapply(1:ncol(nmf_k5_clusters), function(j) {
    fossil::rand.index(group1 = nmf_k5_clusters[, i], group2 = nmf_k5_clusters[, j])
  })
})
#Draw rand-index
rownames(interactions) = colnames(interactions) = colnames(nmf_k5_clusters)

interactions[lower.tri(x = interactions)] = NA
m <- nrow(interactions)
n <- ncol(interactions)

par(bty="n", mar = c(1, 4, 2, 5)+.1, las=2, tcl=-.33, cex = 0.6)
image(
  x = 1:n,
  y = 1:m,
  interactions,
  col = rev(RColorBrewer::brewer.pal(9, "PRGn")),
  xaxt = "n",
  yaxt = "n",
  xlab = "",
  ylab = "",
  xlim = c(0, n+2),
  ylim = c(0, n+2)
)
abline(h=0:n+.5, col="white", lwd=.5)
abline(v=0:n+.5, col="white", lwd=.5)

image(y = seq(0.5*nrow(interactions), 0.9*nrow(interactions), length.out = 8), x=rep(n,2)+c(2,2.5)+1, z=matrix(c(1:8), nrow=1), col = rev(RColorBrewer::brewer.pal(8,"PRGn")), add=TRUE)
atLims = seq(0.5*nrow(interactions), 0.9*nrow(interactions), length.out = 7)
atLimsLabs = round(seq(range(interactions, na.rm = T)[1], range(interactions, na.rm = T)[2], length.out = 7), 2)
axis(side = 4, at = atLims,  tcl = -.15, labels = atLimsLabs, las = 1, lwd = .5)
mtext(side = 4, at = median(atLims), "Rand Index", las = 3, cex = 0.9, line = 3, font = 2)

mtext(side = 2, at = 1:m, text = gsub(pattern = "nprobes_", replacement = "", x = colnames(interactions)), cex = 0.7, font = 1)
mtext(side = 3, at = 1:n, text = gsub(pattern = "nprobes_", replacement = "", x = colnames(interactions)), las = 2, cex = 0.7, font = 1, line = -2) 

Correlation matrix

This is a simple visualization to see the movement of samples for different clustering results.

#Use clusters from nprobes_38990 as a control
nmf_k5_clusters = nmf_k5_clusters[order(nmf_k5_clusters$nprobes_38583),,]

clust_dat = clust_dat[,rownames(nmf_k5_clusters)]
clust_dat_cor_mat = cor(x = clust_dat, method = "pearson")

#Cluster annotations
nmf_k5_clusters_anno = apply(nmf_k5_clusters, 2, function(x) paste0("K_", x))
rownames(nmf_k5_clusters_anno) = rownames(nmf_k5_clusters)

#Cluster colors
clust_cols = lapply(1:ncol(nmf_k5_clusters_anno), function(x) {
  clust_cols = RColorBrewer::brewer.pal(n = 5, name = "Set1")
  names(clust_cols) = paste0("K_", 1:5)
  clust_cols
})
names(clust_cols) = colnames(nmf_k5_clusters_anno)
clust_dat_cor_mat[lower.tri(x = clust_dat_cor_mat, diag = TRUE)] = NA
pheatmap(mat = clust_dat_cor_mat, cluster_rows = FALSE, cluster_cols = FALSE, show_rownames = FALSE, show_colnames = FALSE, annotation_col = nmf_k5_clusters, border_color = "white", fontsize = 7, indiv_legend = list(Clusters = c("nprobes_5000", "nprobes_10000", 'nprobes_15000', "nprobes_20000", "nprobes_25000", "nprobes_30000", "nprobes_35000")), annotation_colors = clust_cols, na_col = 'white')
## Loading required package: scales
## Loading required package: gtable

Final clusters

#Final clusters to be used in downstream analysis
data.table::setDT(x = nmf_k5_clusters, keep.rownames = TRUE)
final_clusters = nmf_k5_clusters[,.(rn, nprobes_38583)]
colnames(final_clusters) = c('Sample_Name', 'Cluster')

Rename clusters:

  • Use prefix C instead of K
  • Re-organize cluster numbers according to methylation level i.e, lowly methylated cluster becomes C1, C2, and so on..
mean_meth = colMeans(x = exprs(tumor_data), na.rm = TRUE)
mean_meth = data.table::data.table(Sample_Name = names(mean_meth), avg_meth = mean_meth, stringsAsFactors = FALSE)
mean_meth = merge(mean_meth, final_clusters)
mean_meth[,Cluster := paste0("C", Cluster)]

clust_order = mean_meth[,median(avg_meth), Cluster][order(V1, decreasing = FALSE)][,Cluster]
print(clust_order)
## [1] "C3" "C2" "C4" "C1" "C5"
#Rename and change order
mean_meth$Cluster = factor(x = mean_meth[,Cluster], levels = clust_order, labels = paste0("C", 1:5))

Final cluster sizes

mean_meth[,.N,Cluster][order(as.character(Cluster))]
##    Cluster  N
## 1:      C1 14
## 2:      C2 34
## 3:      C3 22
## 4:      C4 41
## 5:      C5 32

UMAP visualization

#Use nmf weights (at N = 38553 probes) for UMAP visualization
nmf_contributions = nmf_trial_nrows_k5[[8]]$contributions

umap_settings = umap.defaults
umap_settings$n_neighbors = 14
umap_settings$min_dist = 0.1
umap_settings$random_state = 1024
umap_settings$n_components = 2

#UMAP layout
umap_lo = umap::umap(d = t(nmf_contributions), config = umap_settings)
umap_lo = data.table::as.data.table(as.data.frame(umap_lo$layout), keep.rownames = TRUE)
colnames(umap_lo) = c('Sample_Name', 'UMAP1', 'UMAP2')

#Points are too close to visualize UMAP data; add random noise for better separation
set.seed(seed = 1024)
umap_lo$UMAP1 = jitter(umap_lo$UMAP1, amount = 1)
set.seed(seed = 1024)
umap_lo$UMAP2 = jitter(umap_lo$UMAP2, amount = 1)
#Add custer info
umap_lo = merge(umap_lo, mean_meth[,.(Sample_Name, Cluster)], by = 'Sample_Name')
clust_cols = c(C1 = "#E41A1C", C2 = "#377EB8", C3 = "#4DAF4A", C4 = "#984EA3", C5 = "#FF7F00")
umap_lo$Cluster_cols = clust_cols[as.character(umap_lo$Cluster)]

par(mar = c(1, 1, 1, 1))
plot(NA, pch = 21, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(umap_lo$UMAP1), ylim = range(umap_lo$UMAP2))
abline(h = seq(range(umap_lo$UMAP2)[1], range(umap_lo$UMAP2)[2], length.out = 5), col = "gray70", lty = 2)
abline(v = seq(range(umap_lo$UMAP1)[1], range(umap_lo$UMAP1)[2], length.out = 5), col = "gray70", lty = 2)
points(x = umap_lo$UMAP1, y = umap_lo$UMAP2, col = "black", bg = grDevices::adjustcolor(umap_lo$Cluster_cols, alpha.f = 0.8), pch = 21)
legend("bottomright", legend = names(clust_cols), col = clust_cols, pch = 19, cex = 0.6)

Add cluster information to ExpressionSet

data.table::setDF(mean_meth, mean_meth$Sample_Name)
mean_meth = mean_meth[tumor_data$Sample_Name,]
tumor_data$Cluster = as.factor(as.character(mean_meth$Cluster))

#Cleanup
rm(tumor_data_clean)
rm(clust_dat)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  10223168  546.0   16481394  880.3  16481394  880.3
## Vcells 166383550 1269.5  557565041 4253.9 537680612 4102.2

Cluster characterisation

Correlation matrix

tumor_data = order_by_sds(tumor_data)
samp_order = order(as.character(tumor_data$Cluster))
clust_dat_cor_mat = cor(x = exprs(tumor_data)[1:n_rows, samp_order], method = "spearman")
clust_dat_cor_mat[lower.tri(x = clust_dat_cor_mat, diag = FALSE)] = NA
pheatmap::pheatmap(
    mat = clust_dat_cor_mat,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    show_colnames = FALSE,
    annotation_col = pData(tumor_data)[samp_order, "Cluster", drop = FALSE],
    border_color = "white",
    fontsize = 7,
    annotation_colors = color_codes["Cluster"], na_col = "white"
)

Average methylation

mean_meth_n = colMeans(x = exprs(normal_data), na.rm = TRUE)
mean_meth_n = data.table::data.table(Sample_Name = names(mean_meth_n), avg_meth = mean_meth_n, Cluster = "Normal", stringsAsFactors = FALSE)
mean_meth = data.table::rbindlist(list(mean_meth, mean_meth_n))

mean_meth$Cluster = factor(x = mean_meth$Cluster, levels = c("Normal", paste0("C", 1:5)))
mean_meth$Cluster_codes = color_codes$Cluster[as.character(mean_meth$Cluster)]

#changes between groups
pt_test = pairwise.t.test(x = mean_meth$avg_meth, g = mean_meth$Cluster, paired = FALSE)
print(pt_test)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  mean_meth$avg_meth and mean_meth$Cluster 
## 
##    Normal  C1      C2      C3      C4    
## C1 1.0000  -       -       -       -     
## C2 1.0000  1.0000  -       -       -     
## C3 0.0031  0.0021  0.0016  -       -     
## C4 4.6e-07 1.0e-07 4.2e-10 0.0827  -     
## C5 2.3e-11 2.4e-12 3.2e-16 3.2e-05 0.0144
## 
## P value adjustment method: holm
grid_cols = "gray80"
par(mar = c(3, 3, 1, 1))
b = beanplot::beanplot(avg_meth ~ Cluster, data = mean_meth, xaxt="n", boxwex=0.5, outline=FALSE, lty=1, lwd = 1.4, outwex = 0,
        staplewex = 0, ylim = c(0.5, 1), axes = FALSE, border = "black", ll = 0.05, innerborder = NA, col = lapply(rev(color_codes$Cluster), function(x) c(x, "gray")))
abline(v = 1:6, col = grid_cols, lty = 2)
abline(h = seq(0.5, 1, 0.1), col = grid_cols, lty = 2)
axis(side = 2, at = seq(0.5, 1, 0.1), font = 1, las = 2, cex.axis = 0.8)
axis(side = 1, at = 1:length(b$names), font = 1, labels = b$names, cex.axis = 0.62)
mtext(text = "Cluster", side = 1, font = 1, line = 2, cex = 0.8)
mtext(text = expression(paste("Average ", beta, " value")), side = 2, font = 1, line = 2, cex = 0.8)
text(x = 1:length(b$n), y = 1, labels = table(mean_meth$Cluster), font = 3, cex = 0.8)

#Add significant values
rect(xleft = 1, ybottom = 0.75, xright = 4, ytop = 0.75, col = "black")
text(x = 2.5, y = 0.76, labels = "***", cex = 0.4) #round(pt_test$p.value["C3", "Normal"], digits = 3)

rect(xleft = 1, ybottom = 0.8, xright = 5, ytop = 0.8, col = "black")
text(x = 3, y = 0.81, labels = "***", cex = 0.4) #round(pt_test$p.value["C4", "Normal"], digits = 3)

rect(xleft = 1, ybottom = 0.86, xright = 6, ytop = 0.86, col = "black")
text(x = 3.5, y = 0.87, labels = "***", cex = 0.6) #round(pt_test$p.value["C5", "Normal"], digits = 3)

Cluster enrichments

#Simplify maturation arrest stages. Classify them into alpha-beta and gamma-delta llineage
tumor_data$Maturation_Arrest_Stage_simple = ifelse(
  test = tumor_data$Maturation_Arrest_Stage %in% c("IMG", "IMD", "IM0"),
  yes = "GD_lineage",
  no = ifelse(
    test = tumor_data$Maturation_Arrest_Stage %in% c("TCR_AB_Pos", "IMB"),
    yes = "AB_lineage",
    no = tumor_data$Maturation_Arrest_Stage
  )
)

table(tumor_data$Maturation_Arrest_Stage_simple, tumor_data$Maturation_Arrest_Stage)
##             
##              IM0 IMB IMD IMG TCR_AB_Pos TCR_GD_Pos
##   AB_lineage   0  66   0   0         14          0
##   GD_lineage   7   0  11  15          0          0
##   TCR_GD_Pos   0   0   0   0          0         14

Maturation Arrest Stage enrichments

samp_anno = pData(tumor_data)[,c("Sample_Name", "Maturation_Arrest_Stage_simple", "ETP_Phenotype", "Cluster")]
samp_anno$Sample_Name = rownames(samp_anno)
samp_anno$Maturation_Arrest_Stage_simple = ifelse(test = samp_anno$Maturation_Arrest_Stage_simple == "NA", yes = NA, no = samp_anno$Maturation_Arrest_Stage_simple)
samp_anno$ETP_Phenotype = ifelse(test = samp_anno$ETP_Phenotype == "NA", yes = NA, no = samp_anno$ETP_Phenotype)
clust_encrichments = cluster_enrichment(clust_tbl = samp_anno[, c("Sample_Name", "Cluster")],
                                   anno_tbl = samp_anno[, c("Sample_Name", "Maturation_Arrest_Stage_simple", "ETP_Phenotype")])
## Removed 16 NAs from Maturation_Arrest_Stage_simple
## Removed 18 NAs from ETP_Phenotype
#Remove irrelevent enrichments
clust_encrichments = clust_encrichments[!Feature_lvl %in% c("Negative", "Low", "No", "NA")]
#Significant enrichments
print(clust_encrichments[p_value < 0.01])
##       Cluster Feature_lvl n_featurLvl n_featurLvl2      p_value   OR_low
## 1: Cluster_C5         Yes       14/13        11/87 1.932119e-05 3.313091
## 2: Cluster_C1         Yes         7/4        18/96 1.116473e-03 2.528954
## 3: Cluster_C5  GD_lineage       18/11        15/83 2.203987e-06 3.732913
## 4: Cluster_C4  AB_lineage        32/4        48/43 7.470931e-05 2.597500
## 5: Cluster_C2  AB_lineage        28/3        52/44 1.441678e-04 2.550570
## 6: Cluster_C1  GD_lineage         9/3        24/91 2.601102e-04 3.054148
## 7: Cluster_C3  TCR_GD_Pos        6/13        8/100 7.174160e-03 1.702837
##    OR_high    cluster                        Feature
## 1:     Inf Cluster_C5                  ETP_Phenotype
## 2:     Inf Cluster_C1                  ETP_Phenotype
## 3:     Inf Cluster_C5 Maturation_Arrest_Stage_simple
## 4:     Inf Cluster_C4 Maturation_Arrest_Stage_simple
## 5:     Inf Cluster_C2 Maturation_Arrest_Stage_simple
## 6:     Inf Cluster_C1 Maturation_Arrest_Stage_simple
## 7:     Inf Cluster_C3 Maturation_Arrest_Stage_simple
#msa_data = samp_anno[colnames(clust_dat_cor_mat), c("Maturation_Arrest_Stage", "ETP_Phenotype", "Cluster")]
samp_anno$Cluster = as.character(samp_anno$Cluster)
msa_data = samp_anno[colnames(clust_dat_cor_mat), c("Maturation_Arrest_Stage_simple", "ETP_Phenotype", "Cluster")]
colnames(msa_data)[1] = 'Maturation_Arrest_Stage'
msa_data = msa_data[colnames(clust_dat_cor_mat),]

msa_data$Maturation_Arrest_Stage = ifelse(
  test = msa_data$Maturation_Arrest_Stage == "NA",
  yes = "ND",
  no = msa_data$Maturation_Arrest_Stage
)
msa_lvls = c("GD_lineage", "AB_lineage", "TCR_GD_Pos")

msa_matrix = data.frame(row.names = rownames(msa_data))
for(m in msa_lvls){
  msa_matrix = cbind(msa_matrix,
                     ifelse(
                       test = msa_data$Maturation_Arrest_Stage == m,
                       yes = 1,
                       no = ifelse(
                         test = msa_data$Maturation_Arrest_Stage == "ND",
                         yes = NA,
                         no = 0
                       )
                     ))
  colnames(msa_matrix)[ncol(msa_matrix)] = m
}
msa_matrix$ETP = ifelse(test = msa_data$ETP_Phenotype == "Yes", yes = 1, no = ifelse(test = msa_data$ETP_Phenotype == "No", yes = 0, no = NA))
msa_matrix = msa_matrix[,rev(c("ETP", msa_lvls))]
msa_matrix$Cluster = as.character(msa_data[rownames(msa_matrix), "Cluster"])
msa_matrix = msa_matrix[order(msa_matrix$Cluster, -msa_matrix$ETP, -msa_matrix$GD_lineage, -msa_matrix$AB_lineage, -msa_matrix$TCR_GD_Pos),]

nm2 = matrix(data = msa_matrix$Cluster, nrow = nrow(msa_matrix), ncol = 1)

msa_matrix = msa_matrix[,-which(colnames(msa_matrix) == "Cluster")]
msa_matrix[is.na(msa_matrix)] = 2 #Set NA to 2.
msa_matrix = as.matrix(msa_matrix)
#msa_matrix = cbind(cluster = as.numeric(gsub(pattern = "C", replacement = "", x = nm2))+2, msa_matrix)
clust_cols = color_codes$Cluster[1:5]

layout(mat = matrix(data = c(1, 2), nrow = 2), heights = c(5, 1))

par(mar = c(0, 4, 1, 1))
image(x = 1:nrow(msa_matrix), y = 1:ncol(msa_matrix), z = msa_matrix, col = c("white", "maroon", "gray"), axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(msa_matrix)) + 0.5, col = "white")
points(which(msa_matrix == 0, arr.ind = TRUE), pch=".", col = "black")
mtext(text = colnames(msa_matrix), side = 2, at = 1:ncol(msa_matrix), font = 3, line = 0.4, las = 2, cex = 0.7)
abline(v = (1:nrow(msa_matrix)) + 0.5, col = "white")

par(mar = c(1, 4, 0.2, 1))
image(x = 1:nrow(nm2), y = 1:ncol(nm2), z = as.matrix(as.numeric(gsub(pattern = "C", replacement = "", x = nm2))), col = clust_cols, axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(msa_matrix)) + 0.5, col = "white")
abline(v = (1:nrow(msa_matrix)) + 0.5, col = "white")

plotEnrichment(clust_encrichments, sortByCluster = TRUE, showlegend = FALSE, neg_ylim = 1, pthr = 0.01)

Genetic events enrichments

gen_matrix = pData(tumor_data)[,c("TAL1", "TLX1", "TLX3", "HOXA_trans", "HOXA_cis", "HOXA_nd", "Cluster")]
gen_matrix$Cluster = as.character(msa_data[rownames(gen_matrix), "Cluster"])
gen_matrix = gen_matrix[order(gen_matrix$Cluster, -gen_matrix$TAL1, -gen_matrix$TLX1, -gen_matrix$TLX3, -gen_matrix$HOXA_trans, -gen_matrix$HOXA_cis, -gen_matrix$HOXA_nd),]
gen_matrix[is.na(gen_matrix)] = 2

nm2 = matrix(data = gen_matrix$Cluster, nrow = nrow(gen_matrix), ncol = 1)

gen_matrix = gen_matrix[,-which(colnames(gen_matrix) == "Cluster")]
gen_matrix = as.matrix(gen_matrix)
layout(mat = matrix(data = c(1, 2), nrow = 2), heights = c(5, 1))

par(mar = c(0, 4, 1, 1))
image(x = 1:nrow(gen_matrix), y = 1:ncol(gen_matrix), z = gen_matrix, col = c("white", "maroon", "gray"), axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(gen_matrix)) + 0.5, col = "white")
points(which(gen_matrix == 0, arr.ind = TRUE), pch=".", col = "black")
mtext(text = colnames(gen_matrix), side = 2, at = 1:ncol(gen_matrix), font = 3, line = 0.4, las = 2, cex = 0.7)
abline(v = (1:nrow(gen_matrix)) + 0.5, col = "white")

par(mar = c(1, 4, 0.2, 1))
image(x = 1:nrow(nm2), y = 1:ncol(nm2), z = as.matrix(as.numeric(gsub(pattern = "C", replacement = "", x = nm2))), col = clust_cols, axes = FALSE, xaxt="n", yaxt="n", xlab="", ylab="")
abline(h = (1:ncol(msa_matrix)) + 0.5, col = "white")
abline(v = (1:nrow(msa_matrix)) + 0.5, col = "white")

temp_anno = pData(tumor_data)[,c("Sample_Name", "TAL1", "TLX1", "TLX3", "HOXA_trans", "HOXA_cis", "HOXA_nd", "Cluster")]
tf_enrich = cluster_enrichment(clust_tbl = temp_anno[,c("Sample_Name", "Cluster")], anno_tbl = temp_anno[,c("Sample_Name", "TAL1", "TLX1", "TLX3", "HOXA_trans", "HOXA_cis", "HOXA_nd")])
## Removed 6 NAs from TAL1
## Removed 6 NAs from TLX1
## Removed 6 NAs from TLX3
## Removed 15 NAs from HOXA_trans
## Removed 15 NAs from HOXA_cis
## Removed 15 NAs from HOXA_nd
tf_enrich = tf_enrich[Feature_lvl %in% 1]
tf_enrich[,Feature_lvl := Feature]

#Results
print(tf_enrich[p_value < 0.01])
##       Cluster Feature_lvl n_featurLvl n_featurLvl2      p_value    OR_low
## 1: Cluster_C4    HOXA_cis        7/30         0/91 1.089171e-04  5.263243
## 2: Cluster_C1     HOXA_nd         4/7        7/110 7.352185e-03  1.948147
## 3: Cluster_C5  HOXA_trans       16/14         2/96 1.853806e-10 12.773460
## 4: Cluster_C2        TAL1       16/17        1/103 5.815153e-11 15.730688
## 5: Cluster_C4        TLX1       25/13         3/96 7.574931e-15 17.677091
## 6: Cluster_C3        TLX3        14/8        5/110 5.715071e-10 11.418244
##    OR_high    cluster    Feature
## 1:     Inf Cluster_C4   HOXA_cis
## 2:     Inf Cluster_C1    HOXA_nd
## 3:     Inf Cluster_C5 HOXA_trans
## 4:     Inf Cluster_C2       TAL1
## 5:     Inf Cluster_C4       TLX1
## 6:     Inf Cluster_C3       TLX3
tf_enrich = tf_enrich[!Feature_lvl %in% "HOXA_nd" ] #Remove undetermined HOXA enrichments
plotEnrichment(res = tf_enrich, sortByCluster = TRUE, showlegend = FALSE, neg_ylim = 0.5, pthr = 0.01)

Somatic landscape

Pathways

maf_anno = data.table::copy(x = pData(tumor_data))
colnames(maf_anno)[1] = 'Tumor_Sample_Barcode'

#Read variants
vc_color_codes = c(
  "Mutated" = "#33A02CFF",
  "Amplification" = "#1F78B4FF",
  'Deletion' = "#6A3D9AFF",
  "Multi_Hit" = 'black'
)

#Missing samples: 465 and 616
tall_maf = maftools::read.maf(maf = "data/targetted_NGS/TALL.maf",
                              clinicalData = maf_anno,
                              vc_nonSyn = c("Mutated", "Amplification", "Deletion"), removeDuplicatedVariants = FALSE)
## -Reading
## -Validating
## --Non MAF specific values in Variant_Classification column:
##   Amplification
##   Deletion
##   Mutated
## -Summarizing
## -Processing clinical data
## -Finished in 0.063s elapsed (0.063s cpu)
tall_maf
## An object of class  MAF 
##               ID summary  Mean Median
## 1:    NCBI_Build      NA    NA     NA
## 2:        Center      NA    NA     NA
## 3:       Samples     143    NA     NA
## 4:        nGenes      69    NA     NA
## 5: Amplification      31 0.217      0
## 6:      Deletion     188 1.315      1
## 7:       Mutated     541 3.783      3
## 8:         total     760 5.315      5
#Gene -> Pathway mappings
pathways = data.table::fread(input = "data/targetted_NGS/Leukemic_pathways.tsv")
path_names = names(table(pathways$Group))
pathways$name = factor(x = pathways$Group, levels = path_names, labels = c("CellCycle", "DNAm", "Epigenomic", "JAKSTAT", "Notch", "Other", "PI3KAKT", "Polycomb", 'RAS', "Transcription"))
path_order = c("Notch", "CellCycle", "PI3KAKT", "RAS", "JAKSTAT", "DNAm", "Polycomb", "Epigenomic", "Transcription", "Other")

gs = getGeneSummary(x = tall_maf)[,.(Hugo_Symbol, AlteredSamples)]
pathways = merge(gs, pathways)
pathways = lapply(split(pathways, as.factor(pathways$name))[path_order], function(x){
  x[order(AlteredSamples, decreasing = TRUE)]
})
pathways = data.table::rbindlist(pathways)
maftools::oncoplot(tall_maf, pathways = pathways[,.(Hugo_Symbol, name)], drawRowBar = FALSE, drawColBar = FALSE, clinicalFeatures = "Cluster", annotationColor = list(Cluster = clust_cols), sortByAnnotation=TRUE, colors = vc_color_codes, fontSize=0.4, annotationOrder = c("C1", "C2", "C3", "C4", "C5"), anno_height = 0.5, legend_height = 2, showTitle = FALSE)

Cluster specific enrichments

cluster_enrich = clinicalEnrichment(maf = tall_maf, clinicalFeature = "Cluster")
## Sample size per factor in Cluster:
## 
## C1 C2 C3 C4 C5 
## 14 34 22 41 32
#Significant enrichments
cluster_enrich$groupwise_comparision[fdr < 0.05]
##     Hugo_Symbol Group1 Group2 n_mutated_group1 n_mutated_group2      p_value
##  1:      CDKN2A     C5   Rest          7 of 32        92 of 111 2.669157e-10
##  2:       SUZ12     C5   Rest         15 of 32         8 of 111 1.218342e-06
##  3:        PHF6     C2   Rest          2 of 34        53 of 109 2.016031e-06
##  4:      DNMT3A     C1   Rest          9 of 14        11 of 129 4.277816e-06
##  5:      CDKN2A     C4   Rest         39 of 41        60 of 102 6.191327e-06
##  6:      CDKN2A     C2   Rest         33 of 34        66 of 109 1.249673e-05
##  7:        IDH2     C1   Rest          5 of 14         1 of 129 2.446480e-05
##  8:         NF1     C5   Rest         10 of 32         4 of 111 4.685076e-05
##  9:      NOTCH1     C4   Rest         37 of 41        58 of 102 7.680197e-05
## 10:        JAK3     C5   Rest         13 of 32        11 of 111 1.721430e-04
## 11:       SUZ12     C4   Rest          0 of 41        23 of 102 2.584708e-04
## 12:        PTEN     C2   Rest         11 of 34         8 of 109 6.040793e-04
## 13:        PHF6     C4   Rest         25 of 41        30 of 102 6.143953e-04
## 14:        PHF6     C1   Rest          0 of 14        55 of 129 9.275042e-04
## 15:        JAK3     C2   Rest          0 of 34        24 of 109 1.121977e-03
## 16:      STAT5B     C3   Rest          5 of 22         3 of 121 2.292750e-03
## 17:         WT1     C3   Rest          7 of 22         8 of 121 2.298464e-03
## 18:      BCL11B     C4   Rest         12 of 41         9 of 102 3.396357e-03
## 19:      DNMT3A     C2   Rest          0 of 34        20 of 109 3.874694e-03
## 20:        EZH2     C5   Rest          6 of 32         3 of 111 4.179499e-03
## 21:        DNM2     C2   Rest          1 of 34        27 of 109 5.198835e-03
##     Hugo_Symbol Group1 Group2 n_mutated_group1 n_mutated_group2      p_value
##              OR      OR_low      OR_high          fdr
##  1:  0.05950987 0.018839122    0.1660150 4.270651e-08
##  2: 11.06857487 3.757354339   35.3367322 9.746739e-05
##  3:  0.06701084 0.007407585    0.2847145 1.075217e-04
##  4: 18.50610889 4.660235094   83.8336974 1.711126e-04
##  5: 13.46576850 3.182464883  121.4332414 1.981225e-04
##  6: 21.21639436 3.290439607  890.8298588 3.332461e-04
##  7: 65.64336057 6.446071882 3303.6500748 5.591954e-04
##  8: 11.85071806 3.079422202   56.6029742 9.370152e-04
##  9:  6.93484630 2.247323230   28.8001954 1.365368e-03
## 10:  6.11260917 2.173958129   17.6856582 2.754289e-03
## 11:  0.00000000 0.000000000    0.3600663 3.759576e-03
## 12:  5.93725003 1.932418018   19.1460753 7.561788e-03
## 13:  3.71175929 1.644834256    8.6161540 7.561788e-03
## 14:  0.00000000 0.000000000    0.4306937 1.060005e-02
## 15:  0.00000000 0.000000000    0.4490493 1.196775e-02
## 16: 11.22704508 1.985395740   78.9098421 2.163261e-02
## 17:  6.45691132 1.731091376   23.9120694 2.163261e-02
## 18:  4.22381857 1.468207770   12.6229066 3.018984e-02
## 19:  0.00000000 0.000000000    0.5716395 3.262900e-02
## 20:  8.14068178 1.613490446   53.6327802 3.343599e-02
## 21:  0.09296741 0.002187172    0.6134001 3.961017e-02
##              OR      OR_low      OR_high          fdr
writexl::write_xlsx(x = cluster_enrich$groupwise_comparision, path = "Supplementary_table_S1.xlsx", col_names = TRUE, format_headers = TRUE)
clust_enrich_genes = unique(cluster_enrich$groupwise_comparision[fdr < 0.05, Hugo_Symbol])
#Order results based on pathways
clust_enrich_genes = pathways[Hugo_Symbol %in% clust_enrich_genes][order(Group, -AlteredSamples)]
maftools::oncoplot(
    maf = tall_maf,
    colors = vc_color_codes,
    drawColBar = FALSE,
    showTitle = FALSE,
    drawRowBar = FALSE,
    top = 50,
    clinicalFeatures = c("Cluster"),
    sortByAnnotation = TRUE,
    annotationOrder = c("C1", "C2", "C3", 'C4', "C5"),
    genes = clust_enrich_genes[, Hugo_Symbol],
    annotationColor = list(Cluster = clust_cols),
    keepGeneOrder = TRUE, sepwd_genes = 1, sepwd_samples = 1, 
    legendFontSize = 1, annotationFontSize = 1, removeNonMutated = FALSE, showTumorSampleBarcodes = FALSE)

DNMT3a and IDH2 variants

#Define color codes for variant types
vc_cols = maftools:::get_vcColors()[c(2,4,8,10,13)]
names(vc_cols) = c("coding", "splicing", "stop", "frameshift", "non-frameshift")
vc_cols = c(vc_cols, Multi_Hit = "black", Amp = "red", Del = 'royalblue')

dnmt3a_idh2 = maftools::read.maf(maf = "data/targetted_NGS/DNMT3A_IDH2.maf", vc_nonSyn = names(vc_cols), verbose = FALSE)
lollipopPlot(maf = dnmt3a_idh2, gene = "DNMT3A", colors = vc_cols, refSeqID = "NM_175629", labelPos = 882, collapsePosLabel = TRUE, legendTxtSize = 0.6, axisTextSize = c(0.5, 0.5), domainLabelSize = 0.6, pointSize = 1, titleSize = c(0.8, 0.6))

lollipopPlot(maf = dnmt3a_idh2, gene = "IDH2", colors = vc_cols, refSeqID = "NM_002168", labelPos = 140, collapsePosLabel = TRUE, legendTxtSize = 0.6, axisTextSize = c(0.5, 0.5), domainLabelSize = 0.6, pointSize = 1, titleSize = c(0.8, 0.6))

Thymopoiesis

Differential methylation

Identify DMPs per cell-type

normal_anno = pData(normal_data)[, c("Sample_Name", "Cell_Type")]
normal_anno$Cell_Type = factor(
  x = normal_anno$Cell_Type,
  levels = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
)

design = model.matrix(~0+normal_anno$Cell_Type)
colnames(design) = levels(normal_anno$Cell_Type)

#See here for design matrix details: https://support.bioconductor.org/p/26251/
contrast_matrix =
  limma::makeContrasts(CD34 - (ISP+SPo4+SPo8+TCRn+TCRp)/5,
                       ISP - (CD34+SPo4+SPo8+TCRn+TCRp)/5,
                       TCRn - (CD34+SPo4+SPo8+ISP+TCRp)/5,
                       TCRp - (CD34+SPo4+SPo8+ISP+TCRn)/5,
                       SPo4 - (CD34+TCRp+SPo8+ISP+TCRn)/5,
                       SPo8 - (CD34+TCRp+SPo4+ISP+TCRn)/5,
                       levels = design)

fit = limma::lmFit(object = exprs(normal_data), design = design)
fit2 = limma::contrasts.fit(fit = fit, contrasts = contrast_matrix)
fit2 = limma::eBayes(fit = fit2)

normal_ct_res = lapply(1:6, function(i){
  x = limma::topTable(fit2,
  coef = i,
  adjust = "BH",
  number = "all")
  data.table::setDT(x = x, keep.rownames = TRUE)
})
names(normal_ct_res) = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
#Add CpG annotations to limma results
epic_anno = data.table::as.data.table(fData(normal_data))
normal_ct_res = lapply(normal_ct_res, function(x){
  x = merge(x, epic_anno)
  x
})

sup_table_s2 = lapply(normal_ct_res, function(x){
  x = x[order(adj.P.Val)][adj.P.Val < 0.05 & abs(logFC) > 0.1]
  x
})
writexl::write_xlsx(x = sup_table_s2, path = "Supplementary_table_S2.xlsx", col_names = TRUE, format_headers = TRUE)

#Extract all unique DM probes
normal_ct_res_probes = lapply(normal_ct_res, function(x){
  x = x[order(adj.P.Val)][adj.P.Val < 0.05 & abs(logFC) > 0.1, rn]
  x
})

#Number of DMPs (hyper+hypo) per cell-type
unlist(lapply(normal_ct_res_probes, length))
##  CD34   ISP  TCRn  TCRp  SPo4  SPo8 
## 13333  5353  1529  1304 10221 12054

Overlap of thymic DMPs

fit2_res_probes = lapply(normal_ct_res, function(x){
  x = x[order(adj.P.Val)][adj.P.Val < 0.05 & abs(logFC) > 0.1, rn]
  x
})

olaps = ComplexHeatmap::make_comb_mat(fit2_res_probes)
ComplexHeatmap::UpSet(m = olaps, pt_size = unit(1.5, "mm"), lwd = 1, row_names_gp = gpar(fontsize = 8), set_order = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8"))

normal_ct_res_probes_uniq = unique(unlist(normal_ct_res_probes))
print(length(normal_ct_res_probes_uniq))
## [1] 23069
normal_thymic_specific_dat = order_by_sds(normal_data[normal_ct_res_probes_uniq,])
print(dim(normal_thymic_specific_dat))
## Features  Samples 
##    23069       12
pcaMset(normal_thymic_specific_dat, phenoCol="Cell_Type", phenoColors=color_codes$Cell_Type[1:6], legendPos = "bottom", legendNcol = 1)

Gene onotology for thymic DMPs

#Background ensemble IDs (all unique DMPs)
bg_ens_ids = unique(epic_anno[rn %in%  normal_ct_res_probes_uniq, Nearest_Ens])
bg_ens_ids = bg_ens_ids[!bg_ens_ids %in% ""]
#print(length(bg_ens_ids))

#Run GO for each celltype DMPs
normal_ct_go = lapply(normal_ct_res, function(x){
  xens = unique(x[adj.P.Val < 0.05 & abs(logFC) > 0.1, Nearest_Ens])
  xens = xens[!xens %in% ""]
  xgo = goseq_wrapper(assayedGenes = bg_ens_ids, deGenes = xens)
  xgo[, log_pvalue := -log10(over_represented_pvalue)]
  xgo
})
layout(mat = matrix(1:6, nrow = 2, ncol = 3, byrow = TRUE), widths = c(2, 2))
par(mar = c(2, 1, 1, 1))

for(i in 1:6){
  x = normal_ct_go[[i]][1:10]
  x_lims = ceiling(max(x[,log_pvalue]))
  plot(NA, xlim = c(0, x_lims), ylim = c(0, 10), axes = FALSE, xlab = NA, ylab = NA)
  #abline(v = seq(0, x_lims, by = 1), col = 'gray90', lty = 2)
  for(j in 1:nrow(x)){
    rect(
      xleft = x[j, log_pvalue],
      ybottom = j - 1,
      xright = 0,
      ytop = j,
      col = grDevices::adjustcolor(col = "black", alpha.f = 0.2),
      border = "white"
    )
  }
  axis(side = 1, at = pretty(c(0, x_lims)))
  abline(v = 2, lty = 2, col = 'maroon')
  title(main = names(normal_ct_go)[i], adj = 0, font.main = 4)
  text(x = rep(0, nrow(x)), y = seq(0.5, nrow(x)-0.5, 1), labels = x[, term], adj = 0, xpd = TRUE, col = 'black', font = 1)
}

Thymic DMP sub-classes (tDMPs)

Might look slightly different from the publication due to seed values

#Based on visual inspection of heatmap `k = 8`
k_centers = 8
set.seed(1024)
km8 = kmeans(x = normal_thymic_specific_dat, centers = k_centers)
km8_clust_assn = as.data.frame(km8$cluster)
colnames(km8_clust_assn)[1] = 'cluster'
km8_clust_assn$cluster = paste0("K", km8_clust_assn$cluster)
km8_clust_assn$cluster = factor(km8_clust_assn$cluster, levels = paste0("K", 1:k_centers))

thymic_ct_cols = RColorBrewer::brewer.pal(n = 6, name = "Dark2")
names(thymic_ct_cols) = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")

#Draw heatmap
col_fun = circlize::colorRamp2(breaks = seq(0, 0.99, 0.01),
                               colors = grDevices::colorRampPalette(rev(
                                 RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")
                               ))(100))
col_anno = ComplexHeatmap::HeatmapAnnotation(df = normal_anno[, c("Cell_Type"), drop = FALSE], col = list(sample_type = thymic_ct_cols))
ComplexHeatmap::Heatmap(
  matrix = as.matrix(normal_thymic_specific_dat),
  row_split = km8_clust_assn,
  show_row_names = FALSE,
  top_annotation = col_anno,
  col = col_fun,
  column_gap = grid::unit(10, "mm"),
  border = TRUE, show_column_names = FALSE
)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

#Avg meth (by replicates)
normal_thymic_specific_dat_avg = lapply(seq(1, 12, 2), function(i){
  rowMeans(exprs(normal_thymic_specific_dat[,c(i, i+1)]))
})
normal_thymic_specific_dat_avg = as.data.frame(normal_thymic_specific_dat_avg)
colnames(normal_thymic_specific_dat_avg) = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")

#Draw a line group per cluster
normal_thymic_specific_dat_avg = normal_thymic_specific_dat_avg[rownames(km8_clust_assn),]
normal_thymic_specific_dat_avg$Cluster = km8_clust_assn$cluster
data.table::setDT(x = normal_thymic_specific_dat_avg, keep.rownames = TRUE)
normal_thymic_specific_dat_avg = data.table::melt(data = normal_thymic_specific_dat_avg)
## Warning in melt.data.table(data = normal_thymic_specific_dat_avg): id.vars
## and measure.vars are internally guessed when both are 'NULL'. All non-numeric/
## integer/logical type columns are considered id.vars, which in this case are
## columns [rn, Cluster, ...]. Consider providing at least one of 'id' or 'measure'
## vars in future.
normal_thymic_specific_dat_avg = normal_thymic_specific_dat_avg[,mean(value), .(Cluster, variable)]
normal_thymic_specific_dat_avg = data.table::dcast(data = normal_thymic_specific_dat_avg, variable ~ Cluster, value.var = 'V1')
data.table::setDF(x = normal_thymic_specific_dat_avg, rownames = as.character(normal_thymic_specific_dat_avg$variable))
normal_thymic_specific_dat_avg = normal_thymic_specific_dat_avg[,-1]
normal_thymic_specific_dat_avg = t(normal_thymic_specific_dat_avg)
temp_col_pal = RColorBrewer::brewer.pal(n = 8, name = 'Dark2')

#Draw a line plot for dynamics of methylation per `k`
par(mar = c(2, 2, 1, 1), oma = c(0, 0, 0, 1))
plot(NA, xlim = c(1, 6), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)
abline(h = seq(0, 1, 0.2), v = 1:6, col = 'gray70', lty = 2)
for(i in 1:nrow(normal_thymic_specific_dat_avg)){
  ri = normal_thymic_specific_dat_avg[i,]
  points(1:6, y = ri, type = 'l', col = temp_col_pal[i])
  points(1:6, y = ri, pch = 21, bg = temp_col_pal[i], col = 'black', cex = 0.6)
}
axis(side = 1, at = 1:6, labels = colnames(normal_thymic_specific_dat_avg), cex.axis = 0.6)
axis(side = 2, at = seq(0, 1, 0.2), las = 2, cex.axis = 0.8)

par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE)
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n")
legend(x = "topright", legend = rownames(normal_thymic_specific_dat_avg), col = temp_col_pal, lty = 1, xpd = TRUE, cex = 0.6)

#In manuscript K = tDMP

GO for every tDMP

data.table::setDT(x = km8_clust_assn, keep.rownames = TRUE)
k_go_res = lapply(levels(km8_clust_assn$cluster), function(k){
  k_ens = unique(epic_anno[rn %in% km8_clust_assn[cluster %in% k, rn], Nearest_Ens])
  k_ens = k_ens[!k_ens %in% ""]
  k_go = goseq_wrapper(assayedGenes = bg_ens_ids, deGenes = k_ens)
  k_go[, log_pvalue := -log10(over_represented_pvalue)]
  k_go
})
names(k_go_res) = levels(km8_clust_assn$cluster)

#Sup-table 3
sup_table_s3 = lapply(X = split(km8_clust_assn, as.factor(km8_clust_assn$cluster)), FUN = function(x){
  merge(x, epic_anno, by = 'rn')
})
writexl::write_xlsx(x = sup_table_s3, path = "Supplementary_table_S3.xlsx", col_names = TRUE, format_headers = TRUE)
layout(mat = matrix(1:8, nrow = 2, ncol = 4, byrow = TRUE))
par(mar = c(2, 1, 1, 1))
for(i in 1:8){
  x = k_go_res[[i]][1:10]
  x_lims = ceiling(max(x[,log_pvalue]))
  plot(NA, xlim = c(0, x_lims), ylim = c(0, 10), axes = FALSE, xlab = NA, ylab = NA)
  #abline(v = seq(0, x_lims, by = 1), col = 'gray90', lty = 2)
  for(j in 1:nrow(x)){
    rect(
      xleft = x[j, log_pvalue],
      ybottom = j - 1,
      xright = 0,
      ytop = j,
      col = grDevices::adjustcolor(col = "black", alpha.f = 0.2),
      border = "white"
    )
  }
  axis(side = 1, at = pretty(c(0, x_lims)))
  abline(v = 2, lty = 2, col = 'maroon')
  title(main = names(k_go_res)[i], adj = 0, font.main = 4)
  text(x = rep(0, nrow(x)), y = seq(0.5, nrow(x)-0.5, 1), labels = x[, term], adj = 0, xpd = TRUE, col = 'black', font = 2)
}

#In manuscript K = tDMP

Thymic developmental tree

normal_ct_phyl_data = normal_data[normal_ct_res_probes_uniq,]
normal_ct_phyl_dist = stats::dist(t(exprs(normal_ct_phyl_data)), method = "man")
traj_ip_normal_dist_nj = ape::nj(X = normal_ct_phyl_dist)

normal_anno$Cell_Type_colors = color_codes$Cell_Type[as.character(normal_anno[colnames(normal_ct_phyl_data), "Cell_Type"])]
par(mar = c(0, 1, 0, 1))
plot_tree(tree = traj_ip_normal_dist_nj, col_df = normal_anno[,c("Cell_Type_colors"), drop = FALSE], col_id = 1, type = 'unrooted', rotate = 90, cex = 0.4)

#legend(x = "bottom", legend = names(color_codes$Cell_Type[1:6]), col = color_codes$Cell_Type[1:6], ncol = 3, cex = 0.6)

Phylogenetic trees with random set of probes

#Random set-1
set.seed(seed = 1024)
rand_probes_1 = sample(x = rownames(normal_data)[!rownames(normal_data) %in% normal_ct_res_probes_uniq], size = length(normal_ct_res_probes_uniq), replace = FALSE)
normal_ct_phyl_data_rand1 = normal_data[rand_probes_1,]
dim(normal_ct_phyl_data_rand1)
## Features  Samples 
##    23069       12
normal_ct_phyl_data_rand1_dist = stats::dist(t(exprs(normal_ct_phyl_data_rand1)), method = "man")
normal_ct_phyl_data_rand1_nj = ape::nj(X = normal_ct_phyl_data_rand1_dist)

#Random set-2
set.seed(seed = 1024)
rand_probes_2 = sample(x = rownames(normal_data)[!rownames(normal_data) %in% normal_ct_res_probes_uniq], size = length(normal_ct_res_probes_uniq)*2, replace = FALSE)
normal_ct_phyl_data_rand2 = normal_data[rand_probes_2,]
dim(normal_ct_phyl_data_rand2)
## Features  Samples 
##    46138       12
normal_ct_phyl_data_rand2_dist = stats::dist(t(exprs(normal_ct_phyl_data_rand2)), method = "man")
normal_ct_phyl_data_rand2_nj = ape::nj(X = normal_ct_phyl_data_rand2_dist)
par(mar = c(0, 0, 0, 0), mfrow = c(1, 2))
plot_tree(tree = normal_ct_phyl_data_rand1_nj, col_df = normal_anno[,c("Cell_Type_colors"), drop = FALSE], col_id = 1, type = 'unrooted', cex = 0.4)
plot_tree(tree = normal_ct_phyl_data_rand2_nj, col_df = normal_anno[,c("Cell_Type_colors"), drop = FALSE], col_id = 1, type = 'unrooted', rotate = 270, cex = 0.4)

Effects of thymic DMPs on PCA with tumor samples

non_thymic_probes = rownames(normal_data)[!rownames(normal_data) %in% normal_ct_res_probes_uniq]
non_thymic_probes = intersect(non_thymic_probes, rownames(normal_data))
non_thymic_probes = intersect(non_thymic_probes, rownames(tumor_data))

complete_data_minus_t_cell_probes = Biobase::combine(normal_data[non_thymic_probes,], tumor_data[non_thymic_probes,])
complete_data_minus_t_cell_probes
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 756324 features, 155 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: sam_CD34_1 sam_CD34_2 ... sample_RAD_MI (155 total)
##   varLabels: Sample_Name Age ... Maturation_Arrest_Stage_simple (53
##     total)
##   varMetadata: labelDescription
## featureData
##   featureNames: cg26928153 cg16269199 ... cg02995750 (756324 total)
##   fvarLabels: rn Chr ... Relation_to_Island (12 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
com_probes = intersect(rownames(normal_data), rownames(tumor_data))

complete_data_plus_t_cell_probes = cbind(exprs(normal_data[com_probes,]), exprs(tumor_data[com_probes,]))
complete_data_plus_t_cell_probes = Biobase::ExpressionSet(assayData = as.matrix(complete_data_plus_t_cell_probes))
complete_data_plus_t_cell_probes$Cell_Type = complete_data_minus_t_cell_probes$Cell_Type
complete_data_plus_t_cell_probes
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 779393 features, 155 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: sam_CD34_1 sam_CD34_2 ... sample_RAD_MI (155 total)
##   varLabels: Cell_Type
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
par(mfrow = c(1, 2))
pcaMset(complete_data_minus_t_cell_probes, phenoCol="Cell_Type", phenoColors = color_codes$Cell_Type, legendNcol = 2)
pcaMset(complete_data_plus_t_cell_probes, phenoCol="Cell_Type", phenoColors = color_codes$Cell_Type, legendNcol = 2)

rm(complete_data_minus_t_cell_probes, complete_data_plus_t_cell_probes)
gc()
##             used   (Mb) gc trigger   (Mb)   max used    (Mb)
## Ncells  10749205  574.1   30246528 1615.4   46783552  2498.6
## Vcells 296853472 2264.9 1110396454 8471.7 1387691599 10587.3

Maturation arrest stages

PCA

complete_data_all = Biobase::combine(tumor_data[rownames(normal_ct_phyl_data),], normal_ct_phyl_data)
complete_data_all$Cell_Type2 = c(as.character(complete_data_all$Cluster[1:143]), complete_data_all$Cell_Type[144:155])
print(dim(complete_data_all))
## Features  Samples 
##    23069      155
complete_data_all_pca = prcomp(t(exprs(complete_data_all)))
complete_data_all_pca_dat = as.data.frame(complete_data_all_pca$x[,c("PC1", "PC2", "PC3", "PC4"), drop= FALSE])
var_explained_all = complete_data_all_pca$sdev^2/sum(complete_data_all_pca$sdev^2)

complete_data_all_pca_dat = cbind(pData(complete_data_all)[rownames(complete_data_all_pca_dat),], complete_data_all_pca_dat)
complete_data_all_pca_dat$color = c(clust_cols, color_codes$Cell_Type)[complete_data_all_pca_dat$Cell_Type2]

temp_nt = complete_data_all_pca_dat[complete_data_all_pca_dat$Tissue == "Normal_Thymus", ]
temp_t = complete_data_all_pca_dat[complete_data_all_pca_dat$Tissue != "Normal_Thymus", ]
temp_t$color = grDevices::adjustcolor(col = temp_t$color, alpha.f = 0.4)
par(mar = c(2, 2, 1, 1))
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(complete_data_all_pca_dat$PC1)), ylim = range(pretty(complete_data_all_pca_dat$PC2)))
abline(h = pretty(complete_data_all_pca_dat$PC2), v = pretty(complete_data_all_pca_dat$PC1), col = grid_cols, lty = 2)

points(x = temp_nt$PC1, y = temp_nt$PC2, bg = temp_nt$color, pch = 23, cex = 1)
points(x = temp_t$PC1, y = temp_t$PC2, col = temp_t$color, pch = 19, cex = 0.6)

axis(side = 1, at = pretty(complete_data_all_pca_dat$PC1), lwd = 1, font = 1, cex.axis = 0.6)
mtext(text = paste0("PC2 [", round(var_explained_all[2], digits = 2), "]"), side = 2, cex = 0.6)
mtext(text = paste0("PC1 [", round(var_explained_all[1], digits = 2), "]"), side = 1, cex = 0.6)
axis(side = 2, at = pretty(complete_data_all_pca_dat$PC2), lwd = 1, font = 1, las = 2, cex.axis = 0.6)
legend(x = "topright", legend = names(clust_cols)[1:5], col = grDevices::adjustcolor(col = clust_cols[1:5], alpha.f = 0.4), pch = 19, pt.cex = 0.6, cex = 0.6)
legend(x = "topleft", legend = names(color_codes$Cell_Type[1:6]), pt.bg = color_codes$Cell_Type[1:6], pch = 23, pt.cex = 0.6, cex = 0.6)

Phylogenetic tree

All tumor samples

normal_ct_phyl_data = order_by_sds(mat = normal_ct_phyl_data)
msa_dat = exprs(complete_data_all[rownames(normal_ct_phyl_data)[1:2500],])
dim(msa_dat)
## [1] 2500  155
traj_ip_dist = stats::dist(t(msa_dat), method = "man")
traj_ip_dist_nj = ape::nj(X = traj_ip_dist)
#Color codes
complete_data_all_pca_dat$tree_cols = ifelse(test = complete_data_all_pca_dat$Tissue == "Normal_Thymus", yes = "black", no = complete_data_all_pca_dat$color)
par(mar = c(0, 0, 0, 0))
plot_tree(
  tree = traj_ip_dist_nj,
  col_df = complete_data_all_pca_dat,
  col_id = which(x = colnames(complete_data_all_pca_dat) == "tree_cols"),
  type = 'un',
  cex = 0.1,
  point_size = 0.6,
  edge.width = 0.6,
  rotate.tree = 50
)

Averaged tumor samples

msa_dat = as.data.frame(msa_dat)
data.table::setDT(x = msa_dat, keep.rownames = TRUE)
msa_dat_mlt = data.table::melt(data = msa_dat)
## Warning in melt.data.table(data = msa_dat): id.vars and measure.vars are
## internally guessed when both are 'NULL'. All non-numeric/integer/logical type
## columns are considered id.vars, which in this case are columns [rn, ...].
## Consider providing at least one of 'id' or 'measure' vars in future.
#data.table::setDT(x = tree_anno, keep.rownames = TRUE)
msa_dat_mlt = merge(msa_dat_mlt, complete_data_all_pca_dat[,c("Sample_Name", "Cell_Type2", "Cluster")], by.y = 'Sample_Name', by.x = 'variable')
msa_dat_mlt$Cluster = ifelse(test = is.na(as.character(msa_dat_mlt$Cluster)), yes = "Normal", no = as.character(msa_dat_mlt$Cluster))
msa_dat_mlt_normal = msa_dat_mlt[Cluster %in% "Normal"]
msa_dat_mlt_tumor = msa_dat_mlt[!Cluster %in% "Normal"]
msa_dat_mlt_tumor_avg = msa_dat_mlt_tumor[,mean(value), .(Cluster, rn)]
msa_dat_mlt_tumor_avg = data.table::dcast(data = msa_dat_mlt_tumor_avg, rn ~ Cluster, value.var = 'V1')
msa_dat_mlt_normal = data.table::dcast(data = msa_dat_mlt_normal, rn ~ variable, value.var = 'value')
msa_dat_tum_normal = merge(msa_dat_mlt_tumor_avg, msa_dat_mlt_normal)
data.table::setDF(x = msa_dat_tum_normal, rownames = msa_dat_tum_normal$rn)
msa_dat_tum_normal = msa_dat_tum_normal[,-1]

temp_coldf = data.frame(
  row.names = colnames(msa_dat_tum_normal),
  samp_type = gsub(
    pattern = "sam_|_1|_2",
    replacement = "",
    x = colnames(msa_dat_tum_normal)
  ),
  stringsAsFactors = FALSE
)
temp_coldf$color = c(color_codes$Cluster[1:5], color_codes$Cell_Type[1:6])[temp_coldf$samp_type]
temp_coldf$pch = ifelse(test = temp_coldf$samp_type %in% c("C1", "C2", "C3", "C4", "C5"), yes = 19, no = 23)

#NJ
msa_dat_tum_normal_dist = stats::dist(t(msa_dat_tum_normal), method = "man")
msa_dat_tum_normal_nj = ape::nj(X = msa_dat_tum_normal_dist)
#layout(mat = matrix(c(1, 2), nrow = 2, byrow = TRUE), heights = c(3, 1))
par(mar = c(1, 1, 1, 1))
plot_tree(tree = msa_dat_tum_normal_nj, col_df = temp_coldf, col_id = 2, type = 'un', cex = 0.6, point_size = 1, rotate.tree = 70, pch_id = 3)

Thymopoiesis: DNAm and RNA

RNA analysis

RNA-seq data for normal thymic cell types are obtained from published dataset GSE151079. Data analysis was done using similar cell types for which DNAm EPIC arrays were generated.

normal_rna = data.table::fread(input = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE151nnn/GSE151079/suppl/GSE151079_RNAseq_thymocytes_raw_counts.txt.gz")
colnames(normal_rna) = gsub(pattern = " R1", replacement = "_R1", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = " R2", replacement = "_R2", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = "\\+", replacement = "p", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = "\\-", replacement = "n", x = colnames(normal_rna))
colnames(normal_rna) = gsub(pattern = " ", replacement = "_", x = colnames(normal_rna))
#normal_rna[,.N,biotype]
normal_rna = normal_rna[biotype %in% "protein_coding"]
coldata = data.frame(row.names = colnames(normal_rna)[2:23], Sample_Name = colnames(normal_rna)[2:23], Cell_type = gsub(pattern = "_R1|_R2", replacement = "", x = colnames(normal_rna)[2:23]))
gene_anno = normal_rna[,.(V1, seqname, seqstart, seqend, biotype, seqstrand, symbol)]

To compare with the DNAm we should use similar cell types:

  • CD34+CD1-
  • ISP CD28+
  • DP CD3-
  • DP CD3+
  • SP CD4+
  • SP CD8+
coldata = coldata[coldata$Cell_type %in% c("CD34pCD1n", "ISP_CD28p", "DP_CD3n", "DP_CD3p", "SP_CD4p", 'SP_CD8p'),]
coldata = coldata[order(coldata$Cell_type),]

data.table::setDF(x = normal_rna, rownames = normal_rna$V1)
normal_rna = normal_rna[,rownames(coldata)]

if(all(rownames(coldata) != colnames(normal_rna))){
    stop("Order mismatch!")
}

#gene_anno = data.frame(row.names = gene_anno$V1, ens_id = gene_anno$V1, Hugo_Symbol = gene_anno$symbol)
data.table::setDF(x = gene_anno, rownames = gene_anno$V1)
gene_anno = gene_anno[rownames(normal_rna),]
gene_anno = gene_anno[!duplicated(gene_anno$symbol),]
gene_anno = gene_anno[complete.cases(gene_anno),]
normal_rna = normal_rna[rownames(gene_anno),]

if(!all(rownames(gene_anno) == rownames(normal_rna))){
    stop("Order mismatch!")
}

rownames(normal_rna) = gene_anno$symbol
#Done once and results are stored for re-use
dds = DESeq2::DESeqDataSetFromMatrix(countData = normal_rna, colData = coldata, design = ~Cell_type)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq2::DESeq(object = dds, parallel = TRUE)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
#saveRDS(object = dds, file = paste0(res_dir, "DDS_PC_genes_normalized.RDs"), version = 2)
vsd = DESeq2::vst(object = dds, blind = FALSE)
#saveRDS(object = vsd, file = paste0(res_dir, "VSD_PC_genes.RDs"), version = 2)

vsd_log_mat = order_by_sds(mat = as.data.frame(assay(vsd)))

ct_cols = color_codes$Cell_Type[1:6]
names(ct_cols) = c("CD34pCD1n", "ISP_CD28p", "DP_CD3n", "DP_CD3p", "SP_CD4p", "SP_CD8p")

Expression based trajectory analysis

traj_ip_dist = stats::dist(t(vsd_log_mat[1:1000,]), method = "man")
traj_ip_dist_nj = ape::nj(X = traj_ip_dist)
coldata$color = ct_cols[coldata$Cell_type]
par(mar = c(0, 0, 0, 0))
par(mar = c(0, 0, 0, 0))
plot_tree(tree = traj_ip_dist_nj, col_df = coldata, col_id = which(x = colnames(coldata) == "color"), type = 'un',
    cex = 0.1, point_size = 1.2, edge.width = 0.6, rotate.tree = 130)
legend(x = "topright", legend = names(ct_cols), col = ct_cols, pch = 19, bty = "n", ncol = 2, cex = 0.7)

DEGs (Each cell type vs CD34)

thymic_ct_temp = names(ct_cols)[2:6]
thymic_results = lapply(thymic_ct_temp, function(ct){
  print(paste0("Comparing: ", ct, " vs. CD34pCD1n"))
  cd_temp = coldata[coldata$Cell_type %in% c("CD34pCD1n", ct),, drop = FALSE]
  counts_temp = normal_rna[,rownames(cd_temp)]
  dds_ct = DESeq2::DESeqDataSetFromMatrix(countData = counts_temp, colData = cd_temp, design = ~ Cell_type)
  dds_ct = DESeq2::DESeq(object = dds_ct, parallel = TRUE)
  rds = DESeq2::results(object = dds_ct, contrast = c("Cell_type", ct, "CD34pCD1n"), name = ct, lfcThreshold = 0.6)
  rds = data.table::as.data.table((as.data.frame(rds)), keep.rownames = TRUE)
  rds[order(padj)]
})
## [1] "Comparing: ISP_CD28p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: DP_CD3n vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: DP_CD3p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: SP_CD4p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## [1] "Comparing: SP_CD8p vs. CD34pCD1n"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
names(thymic_results) = thymic_ct_temp
thymic_deg_table = data.frame(
  down = sapply(thymic_results, function(x)
    nrow(x[padj < 0.1][log2FoldChange < 0])),
  up = sapply(thymic_results, function(x)
    nrow(x[padj < 0.1][log2FoldChange > 0]))
)

print(thymic_deg_table)
##           down   up
## ISP_CD28p  751  494
## DP_CD3n   1417 1053
## DP_CD3p   1548 1398
## SP_CD4p   1558 1713
## SP_CD8p   1322 1488
thymic_degs = unique(as.character(unlist(lapply(thymic_results, function(x) x[padj < 0.1, rn]))))

Promoter methylation

tss_probes = data.table::fread(input = "data/extdata/EPIC_TSS_probes.bed") #EPIC probes with 1.5kb of TSS
temp_normal_meth = as.data.table(exprs(normal_data), keep.rownames = TRUE)
temp_normal_meth = merge(tss_probes[,.(V4, V9)], temp_normal_meth, by.x = 'V9', by.y = 'rn') #Change V4 -> V5 for avg. methylation per tx
temp_normal_meth[,V9 := NULL]
normal_pm = temp_normal_meth[,lapply(.SD, mean, na.rm = TRUE), by = V4]
data.table::setDF(x = normal_pm, rownames = normal_pm$V4)
normal_pm$V4 = NULL

normal_pm = normal_pm[,c("sam_CD34_1", "sam_CD34_2", "sam_ISP_1", "sam_ISP_2", "sam_TCRn_1", "sam_TCRn_2", "sam_TCRp_1", "sam_TCRp_2", "sam_SPo4_1", "sam_SPo4_2", "sam_SPo8_1", "sam_SPo8_2")]

rm(temp_normal_meth, tss_probes)

Identify DM Promoters per cell-type

normal_anno = data.frame(row.names = colnames(normal_pm), Cell_Type = rep(c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8"), each = 2))
normal_anno$Cell_Type = factor(
  x = normal_anno$Cell_Type,
  levels = c("CD34", "ISP", "TCRn", "TCRp", "SPo4", "SPo8")
)

design = model.matrix(~0+normal_anno$Cell_Type)
colnames(design) = levels(normal_anno$Cell_Type)

fit = limma::lmFit(object = normal_pm, design = design)

contrast_matrix =
  limma::makeContrasts(ISP - CD34, TCRn - CD34, TCRp - CD34, SPo4 - CD34, SPo8 - CD34,
                       levels = design)

fit2 = limma::contrasts.fit(fit = fit, contrasts = contrast_matrix)
fit2 = limma::eBayes(fit = fit2)

normal_ct_res = lapply(1:5, function(i){
  x = limma::topTable(fit2,
  coef = i,
  adjust = "BH",
  number = "all")
  data.table::setDT(x = x, keep.rownames = TRUE)
})
names(normal_ct_res) = c("ISP", "TCRn", "TCRp", "SPo4", "SPo8")
thymic_dmp_table = data.frame(
  Hypo = sapply(normal_ct_res, function(x)
    nrow(x[P.Value < 0.01][logFC < 0])),
  Hyper = sapply(normal_ct_res, function(x)
    nrow(x[P.Value < 0.01][logFC > 0]))
)

print(thymic_dmp_table)
##      Hypo Hyper
## ISP   184   127
## TCRn  531   289
## TCRp  829   286
## SPo4  766  1446
## SPo8  686  1802
thymic_dmps = unique(as.character(unlist(lapply(normal_ct_res, function(x) x[P.Value < 0.01, rn]))))

Barplot of DEG and DMP

thymic_deg_table = thymic_deg_table[c("ISP_CD28p", "DP_CD3n", "DP_CD3p", "SP_CD4p", "SP_CD8p"),]

par(mar = c(2, 2, 3, 2))
b = barplot(t(thymic_deg_table), horiz = TRUE, xlim = c(-2500, 3300), col = c("#807DBA", "#F16913"), names.arg = rep(NA, 5), axes = FALSE)
barplot(-t(thymic_dmp_table), horiz = TRUE, add = TRUE, col = c("#807DBA", "#F16913"), names.arg = rep(NA, 5), axes = FALSE)
legend(x = -2000, y = 8, legend = c("Hypo", "Hyper"), col = c("#807DBA", "#F16913"), pch = 15, xpd = TRUE, bty = "n", title = "Methylation", cex = 0.8, ncol = 2)
legend(x = 2000, y = 8, legend = c("Down", "Up"), col = c("#807DBA", "#F16913"), pch = 15, xpd = TRUE, bty = "n", title = "Expression", cex = 0.8, ncol = 2)
axis(side = 1, at = seq(-3000, 3000, 500), labels = abs(seq(-3000, 3000, 500)), cex.axis = 0.8)
mtext(text = c("ISP", "TCRn", "TCRp", "SP4", "SP8"), side = 2, at = b, las =2, cex = 0.8)

getAvgMeth = function(m, g, scale = TRUE){
    mg = sapply(g, function(gi){
        sapply(seq(1, 12, 2), function(i){
        mean(m[gi,][c(i, i+1)])
    })
    })
    
    if(scale){
        mg = apply(mg, 2, scale)
    }
    
    rownames(mg) = c("CD34", "ISP", "TCRn", "TCRp", "SP4", "SP8")
    mg
}

plotLineMeth = function(x){
    #par(mfrow = c(1, ncol(x)), mar = c(4, 3, 1, 1))
    par(mar = c(4, 3, 1, 1))
    
    for(i in 1:ncol(x)){
        plot(x[,i], type = 'l', axes = FALSE, ylim = range(pretty(x[,i])), xlab = NA)
        axis(side = 1, at = 1:nrow(x), labels = rownames(x), las = 2)
        axis(side = 2, at = pretty(x[,i]), labels = pretty(x[,i]), line = 0.5, las = 2)
        points(x[,i])
        title(colnames(x)[i])
    }
}
#Order according to dev. stages
vsd_log_mat = vsd_log_mat[,c("CD34pCD1n_R1", "CD34pCD1n_R2", "ISP_CD28p_R1", "ISP_CD28p_R2", "DP_CD3n_R1", "DP_CD3n_R2", "DP_CD3p_R1", "DP_CD3p_R2", "SP_CD4p_R1", "SP_CD4p_R2", "SP_CD8p_R1", "SP_CD8p_R2")]
com_genes = intersect(rownames(normal_pm), rownames(normal_rna))
length(com_genes)
## [1] 17324

Draw DNAm and expression dynamics for few known marker genes

mark_meth = getAvgMeth(m = as.matrix(normal_pm), g = c("CD34", "SPI1", "NOTCH1", "TCF7", "CD27", "CD8A"), scale = TRUE)
mark_exp = getAvgMeth(m = as.matrix(vsd_log_mat), g = c("CD34", "SPI1", "NOTCH1", "TCF7", "CD27", "CD8A"), scale = TRUE)

par(mfrow = c(2, ncol(mark_meth)))
plotLineMeth(x = mark_exp)
plotLineMeth(x = mark_meth)

Genes with both DM promoter and DM expressions

isp_com_dg = intersect(thymic_results$ISP_CD28p[padj < 0.1, rn], normal_ct_res$ISP[P.Value < 0.01, rn])
tcrn_com_dg = intersect(thymic_results$DP_CD3n[padj < 0.1, rn], normal_ct_res$TCRn[P.Value < 0.01, rn])
tcrp_com_dg = intersect(thymic_results$DP_CD3p[padj < 0.1, rn], normal_ct_res$TCRp[P.Value < 0.01, rn])
sp4_com_dg = intersect(thymic_results$SP_CD4p[padj < 0.1, rn], normal_ct_res$SPo4[P.Value < 0.01, rn])
sp8_com_dg = intersect(thymic_results$SP_CD8p[padj < 0.1, rn], normal_ct_res$SPo8[P.Value < 0.01, rn])

lapply(list(isp_com_dg, tcrn_com_dg, tcrp_com_dg, sp4_com_dg, sp8_com_dg), length)
## [[1]]
## [1] 44
## 
## [[2]]
## [1] 145
## 
## [[3]]
## [1] 235
## 
## [[4]]
## [1] 588
## 
## [[5]]
## [1] 626
udgs = sort(unique(c(isp_com_dg, tcrn_com_dg, tcrp_com_dg, sp4_com_dg, sp8_com_dg)))

length(udgs)
## [1] 942
writexl::write_xlsx(x = data.table::data.table(Gene = udgs), path = "Supplementary_table_S4.xlsx", col_names = TRUE, format_headers = TRUE)
udgs_m = t(getAvgMeth(m = as.matrix(normal_pm), g = udgs, scale = FALSE))
udgs_e = t(getAvgMeth(m = as.matrix(vsd_log_mat), g = udgs, scale = FALSE))
par(mar = c(4, 4, 1, 1))
plot(NA, xlim = c(1,6), ylim = c(-1, 0.1), axes = FALSE, xlab = NA, ylab = NA)
axis(side = 1, at = 1:6, labels = colnames(udgs_m), las = 2)
axis(side = 2, at = seq(-1, 0, 0.2), las = 2)
abline(h = seq(-1, 0, 0.2), v = 1:5, lty = 2, col = 'gray80')

for(i in 1:6){
    xcor = cor.test(udgs_m[,i ], udgs_e[,i])
    points(x = i , y = xcor$estimate, pch = 19)
    segments(x0 = i, y0 = xcor$conf.int[1], x1 = i, y1 = xcor$conf.int[2], lwd = 1)
    text(x = i, y = 0.1, labels = ifelse(xcor$p.value < 0.001, yes = "***", no = "ns"), xpd = TRUE)
}
mtext(text = "Pearson correlation", side = 2, line = 2.5)

GO for common promoter methylated and Differentially expressed gene

udgs_go = goseq_wrapper(assayedGenes = rownames(vsd_log_mat), deGenes = udgs, source_id = "geneSymbol")
par(mar = c(3, 1, 1, 2))
x = udgs_go[1:20]
x[, log_pvalue := -log10(over_represented_pvalue)]
x_lims = ceiling(max(x[, log_pvalue]))
plot(
    NA,
    xlim = c(0, x_lims),
    ylim = c(0, nrow(x)),
    axes = FALSE,
    xlab = NA,
    ylab = NA
)
#abline(v = seq(0, x_lims, by = 1), col = 'gray90', lty = 2)
for (j in 1:nrow(x)) {
    rect(
        xleft = x[j, log_pvalue],
        ybottom = j - 1,
        xright = 0,
        ytop = j,
        col = grDevices::adjustcolor(col = "black", alpha.f = 0.2),
        border = "white"
    )
}
axis(side = 1, at = pretty(c(0, x_lims)))
abline(v = 2, lty = 2, col = 'maroon')
#title(main = names(down_go)[i], adj = 0, font.main = 4)
text(
    x = rep(0, nrow(x)),
    y = seq(0.5, nrow(x) - 0.5, 1),
    labels = x[, term],
    adj = 0,
    xpd = TRUE,
    col = 'black',
    font = 1, cex = 0.7
)

dnam_ct = c("CD34", "ISP", "TCRn", "TCRp", "SP4", "SP8")
dnam_ct_cols = ct_cols
names(dnam_ct_cols) = dnam_ct

x = pheatmap::pheatmap(mat = normal_pm[udgs, ], scale = "row", show_rownames = FALSE, width = 4, height = 5, cluster_cols = FALSE, annotation = data.frame(row.names = colnames(normal_pm), Cell_type = rep(dnam_ct, each = 2)), annotation_colors = list(Cell_type = dnam_ct_cols), show_colnames = FALSE, annotation_legend = TRUE)

pheatmap::pheatmap(vsd_log_mat[rownames(normal_pm[udgs, ][x$tree_row$order,]), ], cluster_rows = FALSE, scale = 'row', show_rownames = FALSE, width = 3, height = 5, cluster_cols = FALSE, annotation = coldata[,c("Cell_type"), drop = FALSE], annotation_colors = list(Cell_type = ct_cols), show_colnames = FALSE, annotation_legend = FALSE)

Clusters vs/ Normals - Differential Methylation

Limma

com_probes = intersect(rownames(normal_data), rownames(tumor_data))
meth_exprs = Biobase::combine(tumor_data[com_probes,], normal_data[com_probes,])
dim(meth_exprs)
## Features  Samples 
##   779393      155
#Remove Thymic DMPs from analysis
meth_exprs = meth_exprs[rownames(meth_exprs)[!rownames(meth_exprs) %in% normal_ct_res_probes_uniq], ]
print(dim(meth_exprs))
## Features  Samples 
##   756324      155
meth_exprs$Cluster = ifelse(is.na(meth_exprs$Cluster), "Normal", as.character(meth_exprs$Cluster))

cell_type <- as.factor(meth_exprs$Cluster)
design <- model.matrix(~0+cell_type)
colnames(design) <- levels(cell_type)

fit <- limma::lmFit(object = exprs(meth_exprs), design = design)

contrast_matrix <- limma::makeContrasts(C1-Normal, C2-Normal, C3-Normal, C4-Normal, C5-Normal, levels = design)
fit2 <- limma::contrasts.fit(fit = fit, contrasts = contrast_matrix)
fit2 <- limma::eBayes(fit = fit2)
dmps = lapply(1:5, function(k) {
  k_dmps = limma::topTable(fit2,
  coef = k,
  adjust = "BH",
  number = "all")
  data.table::setDT(k_dmps, keep.rownames = TRUE)
  k_dmps = merge(k_dmps, epic_anno, all.x = TRUE)
  k_dmps
})

names(dmps) = paste0("C", 1:5)

sup_table_s5 = lapply(dmps, function(x){
  x[adj.P.Val < 0.05][abs(logFC) > 0.20]
})
writexl::write_xlsx(x = sup_table_s5, path = "Supplementary_table_S5.xlsx", col_names = TRUE, format_headers = TRUE)
#DMP definition: FDR < 0.05 and |meth| > 20%
dmp_tbl = lapply(dmps, function(x) {
  x = x[adj.P.Val < 0.05][abs(logFC) > 0.20]
  data.frame(table(ifelse(
  x$logFC > 0, yes = "Hyper", no = "Hypo"
  )))
})

dmp_tbl = data.table::rbindlist(l = dmp_tbl, idcol = "Cluster")
dmp_tbl[,freq := Freq / sum(Freq), .(Cluster)]

#DMPs identified
print(data.table::dcast(data = dmp_tbl, Cluster ~ Var1, value.var = 'freq'))
##    Cluster     Hyper      Hypo
## 1:      C1 0.6159886 0.3840114
## 2:      C2 0.6351961 0.3648039
## 3:      C3 0.7749324 0.2250676
## 4:      C4 0.7841419 0.2158581
## 5:      C5 0.8739028 0.1260972
print(data.table::dcast(data = dmp_tbl, Cluster ~ Var1, value.var = 'Freq'))
##    Cluster  Hyper  Hypo
## 1:      C1  34913 21765
## 2:      C2  49161 28234
## 3:      C3  77921 22631
## 4:      C4  98529 27123
## 5:      C5 102452 14783
dmp_tbl$Freq_2 = ifelse(test = dmp_tbl$Var1 == "Hypo", yes = -1*as.numeric(dmp_tbl$Freq), dmp_tbl$Freq)
dmp_tbl$freq_2 = ifelse(test = dmp_tbl$Var1 == "Hypo", yes = -1*as.numeric(dmp_tbl$freq), dmp_tbl$freq)

#Plot raw numbers
temp_x_lim = range(dmp_tbl$Freq_2)
dmp_tbl = split(dmp_tbl, as.factor(dmp_tbl$Cluster))
par(mar = c(2, 2, 1, 1))
plot(NA, NA, xlim = range(pretty(temp_x_lim)), ylim = c(0, 5.5), axes = FALSE, xlab = NA, ylab = NA)
abline(h = seq(0.5, 4.5, 1), v = pretty(temp_x_lim), col = grDevices::adjustcolor(col = 'gray70', alpha.f = 0.4))
for(i in 1:length(dmp_tbl)){
  temp_x = dmp_tbl[[i]]
  rect(xleft = 0, ybottom = i-1, xright = temp_x[Var1 %in% "Hyper", Freq_2], ytop = i, col = "#F16913", border = "white", lwd = 1.2)
  rect(xleft = 0, ybottom = i-1, xright = temp_x[Var1 %in% "Hypo", Freq_2], ytop = i, col = "#807DBA", border = "white", lwd = 1.2)
}
axis(side = 1, at = pretty(temp_x_lim), labels = abs(pretty(temp_x_lim)), cex.axis = 0.5)
axis(side = 2, at = seq(0.5, 4.5, 1), labels = names(dmp_tbl), las = 2, tick = FALSE, line = -1)
text(x = 2500, y = 5.2, labels = "Hyper", adj = 0, col = "#F16913", font = 4)
text(x = -1500, y = 5.2, labels = "Hypo", adj = 1, col = "#807DBA", font = 4)

DMP Annotations

#Genomic annotation
anno_stats = lapply(dmps, function(x){
  x$DM = ifelse(x$logFC > 0 , yes = 'Hyper', no = 'Hypo')
  x = x[,.N,.(DM, Annotation)]
  x[,fract := N/sum(N), .(DM)]
  x
})

anno_stats = data.table::rbindlist(l = anno_stats, idcol = 'Cluster')
anno_stats = data.table::dcast(data = anno_stats, Cluster ~ Annotation+DM, value.var = 'fract')
data.table::setDF(x = anno_stats, rownames = anno_stats$Cluster)
anno_stats = anno_stats[,-1]
anno_stats_hypo = t(anno_stats[,-grep(pattern = "Hyper", x = colnames(anno_stats))])
anno_stats_hyper = t(anno_stats[,grep(pattern = "Hyper", x = colnames(anno_stats))])

#CpG island anno
island_stats = lapply(dmps, function(x){
  x$DM = ifelse(x$logFC > 0 , yes = 'Hyper', no = 'Hypo')
  x = x[,.N,.(DM, Relation_to_Island)]
  x[,fract := N/sum(N), .(DM)]
  x
})
island_stats = data.table::rbindlist(l = island_stats, idcol = 'Cluster')
island_stats = data.table::dcast(data = island_stats, Cluster ~ Relation_to_Island+DM, value.var = 'fract')
data.table::setDF(x = island_stats, rownames = island_stats$Cluster)
island_stats = island_stats[,-1]
island_stats_hypo = t(island_stats[,-grep(pattern = "Hyper", x = colnames(island_stats))])
island_stats_hyper = t(island_stats[,grep(pattern = "Hyper", x = colnames(island_stats))])
layout(mat = matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE), widths = c(1, 1.2), heights = c(0.95, 1))
par(mar = c(1, 3, 1, 1), las = 2)
barplot(height = anno_stats_hyper, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1), axes = FALSE)
barplot(height = anno_stats_hypo, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1.5), axes = FALSE)
legend(x = "topright", col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), legend = gsub(pattern = "_Hypo", replacement = "", x = rownames(anno_stats_hypo)), pch = 15, cex = 0.5)

par(mar = c(3, 3, 0, 1))
barplot(height = island_stats_hyper, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1), axes = FALSE)
axis(side = 1, at = seq(0, 1, 0.2))
barplot(height = island_stats_hypo, col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), border = 'black', horiz = TRUE, xlim = c(0, 1.5), axes = FALSE)
legend(x = "topright", col = RColorBrewer::brewer.pal(n = 8, name = 'Accent'), legend = gsub(pattern = "_Hyper", replacement = "", x = rownames(island_stats_hyper)), pch = 15, cex = 0.6)
axis(side = 1, at = seq(0, 1, 0.2), cex = 0.6)

Enhancer landscapes (BLUEPRINT)

T-ALL samples 12 T-ALL samples are analyzed, generated as a part of BLUEPRINT cohort. Steps involved in processing: (MACS2 peakcalling -> peak filtering -> SE calling) with chiptk. Super and Typical enhancers are identified using ROSE.

Normal Thymic cell types 5 distinct thymic cell types from healthy thymus were analyzed, generated as a part of BLUEPRINT cohort.

  • CD34: CD34 positive
  • EC: CD3negative CD4positive CD8positive DP thymocytes
  • LC: CD3positive CD4positive CD8positive DP thymocytes
  • SP4: CD4positive alpha-beta thymocyte
  • SP8: CD8positive alpha-beta thymocyte

Following analysis are performed:

  1. Call peaks for all cell types for histone marks H3K27ac, H3K4me1, and H3K4me3
  2. Merge peaks from same histone marks to create consensus peakset
  3. Call Super-Eahancers from H3K27ac
  4. Divide regulatory regions into:
Regulatory region H3K4me1 H3K27ac H3K4me3
Active enhancers Yes Yes NA
Poised enhancers Yes No NA
Active promoters NA NA Yes
Super enhancers NA Yes (12kb stitched) NA

H3K27ac data

#SuperEnhancers
se_files = list.files(path = "data/BLUEPRINT/H3K27ac/SE/", pattern = '_AllEnhancers_ENHANCER_TO_GENE.txt', all.files = TRUE, recursive = TRUE, full.names = TRUE)

#K27ac narropeaks
k27ac_files = list.files(path = "data/BLUEPRINT/H3K27ac/narrowPeak/", pattern = '*\\.narrowPeak$', all.files = TRUE, recursive = TRUE, full.names = TRUE)

#bigWig files
bw_files = list.files(path = "data/BLUEPRINT/H3K27ac/bigWigs/", pattern = '_treat_pileup.bw', all.files = TRUE, recursive = TRUE, full.names = TRUE)

k27ac_pdata = data.frame(se_files, bw_files, narrowpeaks = k27ac_files, stringsAsFactors = FALSE)
k27ac_pdata$Sample_Name = paste0(
  "sample_",
  gsub(
    pattern = "_H3K27ac_treat_pileup.bw|b|_treat_pileup.bw",
    replacement = "",
    x = basename(path = k27ac_pdata$bw_files)
  )
)

#T13C is sample649
k27ac_pdata$Sample_Name = gsub(pattern = "T13C", replacement = "649", x = k27ac_pdata$Sample_Name)
k27ac_pdata = merge(k27ac_pdata, pData(tumor_data), all.x = TRUE)

table(k27ac_pdata$Cluster)
## 
## C1 C2 C3 C4 C5 
##  0  7  0  2  3
#Process bigWigs with bwtool to get the peak intensities (area under the peak)
pdata_act_enh = data.table::copy(x = k27ac_pdata)
pdata_act_enh = pdata_act_enh[pdata_act_enh$Tissue %in% "Leukemic_BM",]
pdata_act_enh = peakSeason::read_coldata(coldata = pdata_act_enh, files_idx = 3, sample_idx = 1)
## Checking for files..
## Done!
#Extract peak intensities for active K27ac peaks (peaks non-overlapping with known TSS)
pdata_act_enh_auc = peakSeason::extract_summary(
  coldata = pdata_act_enh,
  bed = "data/BLUEPRINT/H3K27ac/narrowPeak_minusTSS/consensus_active_disease_enhancers.bed",
  op_dir = tempdir(check = TRUE),
  nthreads = 12,
  rmAfter = TRUE
)
## Done!
summary_mat = pdata_act_enh_auc$summaries[,6:ncol(pdata_act_enh_auc$summaries)]
summary_mat = data.table::setDF(x = summary_mat)
summary_mat = summary_mat[complete.cases(summary_mat),]
summary_mat = order_by_sds(mat = summary_mat)
#PCA on top 25000 peaks
summary_mat_pca = prcomp(t(summary_mat[1:25000,]))
summary_mat_pca_dat = as.data.frame(summary_mat_pca$x[,c("PC1", "PC2", "PC3", "PC4"), drop= FALSE])
var_explained_k27ac = summary_mat_pca$sdev^2/sum(summary_mat_pca$sdev^2)

summary_mat_pca_dat = cbind(summary_mat_pca_dat, Cluster = pdata_act_enh_auc$cdata$Cluster)

H3K4me3 data

#K4me3 narropeaks
k4me3_files = list.files(path = "data/BLUEPRINT/H3K4me3/narrowPeak/", pattern = '*\\.narrowPeak$', all.files = TRUE, recursive = TRUE, full.names = TRUE)

#bigWig files
bw_files = list.files(path = "data/BLUEPRINT/H3K4me3/bigWigs/", pattern = '_treat_pileup.bw', all.files = TRUE, recursive = TRUE, full.names = TRUE)

k4me3_pdata = data.frame(bw_files, narrowpeaks = k4me3_files, stringsAsFactors = FALSE)
k4me3_pdata$Sample_Name = paste0(
  "sample_",
  gsub(
    pattern = "_H3K4me3_treat_pileup.bw|b|_treat_pileup.bw",
    replacement = "",
    x = basename(path = k4me3_pdata$bw_files)
  )
)

#T13C is sample649
k4me3_pdata$Sample_Name = gsub(pattern = "T13C", replacement = "649", x = k4me3_pdata$Sample_Name)
k4me3_pdata = merge(k4me3_pdata, pData(tumor_data), all.x = TRUE)

table(k4me3_pdata$Cluster)
## 
## C1 C2 C3 C4 C5 
##  0  7  0  1  3
#Process bigWigs with bwtool to get the peak intensities (area under the peak)
pdata_promoters = data.table::copy(x = k4me3_pdata)
pdata_promoters = pdata_promoters[pdata_promoters$Tissue %in% "Leukemic_BM",]
pdata_promoters = peakSeason::read_coldata(coldata = pdata_promoters, files_idx = 2, sample_idx = 1)
## Checking for files..
## Done!
#Extract peak intensities for k4me3 peaks
pdata_promoters_auc = peakSeason::extract_summary(
  coldata = pdata_promoters,
  bed = "data/BLUEPRINT/H3k4me3/narrowPeak/consensus_active_disease_promoters.bed",
  op_dir = tempdir(check = TRUE),
  nthreads = 12,
  rmAfter = TRUE
)
## Done!
summary_mat_prom = pdata_promoters_auc$summaries[,6:ncol(pdata_promoters_auc$summaries)]
summary_mat_prom = data.table::setDF(x = summary_mat_prom)
summary_mat_prom = summary_mat_prom[complete.cases(summary_mat_prom),]
summary_mat_prom = order_by_sds(mat = summary_mat_prom)
#PCA on top 25000 peaks
summary_mat_prom_pca = prcomp(t(summary_mat_prom[1:25000,]))
summary_mat_prom_pca_dat = as.data.frame(summary_mat_prom_pca$x[,c("PC1", "PC2", "PC3", "PC4"), drop= FALSE])
var_explained_k4me3 = summary_mat_prom_pca$sdev^2/sum(summary_mat_prom_pca$sdev^2)

summary_mat_prom_pca_dat = cbind(summary_mat_prom_pca_dat, Cluster = pdata_promoters_auc$cdata$Cluster)
par(mfrow = c(1, 2), mar = c(3, 3, 1, 1))
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(summary_mat_pca_dat$PC1)), ylim = range(pretty(summary_mat_pca_dat$PC2)))
abline(h = pretty(summary_mat_pca_dat$PC2), v = pretty(summary_mat_pca_dat$PC1), col = grid_cols, lty = 2)
points(x = summary_mat_pca_dat$PC1, y = summary_mat_pca_dat$PC2, bg = clust_cols[summary_mat_pca_dat$Cluster], pch = 21, col = "black")
axis(side = 1, at = pretty(summary_mat_pca_dat$PC1), lwd = 1.5, font = 1, cex.axis = 0.8)
axis(side = 2, at = pretty(summary_mat_pca_dat$PC2), lwd = 1.5, font = 1, las = 2, cex.axis = 0.8)
mtext(text = paste0("PC2 [", round(var_explained_k27ac[2], digits = 2), "]"), side = 2, line = 2, cex = 0.8)
mtext(text = paste0("PC1 [", round(var_explained_k27ac[1], digits = 2), "]"), side = 1, line = 2, cex = 0.8)
title(main = "H3K27ac", adj = 0, cex.main = 0.8)
legend(x = "bottomright", legend = names(clust_cols[c("C1", "C2", "C4", "C5")]), col = clust_cols[c("C1", "C2", "C4", "C5")], pt.bg = clust_cols[1:6], pch = 19, pt.cex = 0.6, cex = 0.6)

plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = range(pretty(summary_mat_prom_pca_dat$PC1)), ylim = range(pretty(summary_mat_prom_pca_dat$PC2)))
abline(h = pretty(summary_mat_prom_pca_dat$PC2), v = pretty(summary_mat_prom_pca_dat$PC1), col = grid_cols, lty = 2)
points(x = summary_mat_prom_pca_dat$PC1, y = summary_mat_prom_pca_dat$PC2, bg = clust_cols[summary_mat_prom_pca_dat$Cluster], pch = 21, col = "black")
axis(side = 1, at = pretty(summary_mat_prom_pca_dat$PC1), lwd = 1.5, font = 1, cex.axis = 0.8)
axis(side = 2, at = pretty(summary_mat_prom_pca_dat$PC2), lwd = 1.5, font = 1, las = 2, cex.axis = 0.8)
mtext(text = paste0("PC2 [", round(var_explained_k4me3[2], digits = 2), "]"), side = 2, line = 2, cex = 0.8)
mtext(text = paste0("PC1 [", round(var_explained_k4me3[1], digits = 2), "]"), side = 1, line = 2, cex = 0.8)
title(main = "H3K4me3", adj = 0, cex.main = 0.8)

Super enhancer landscapes

Super enhancers per cluster were identified by merging all BAM files from same cluster follwed by analysis with ROSE.

se_cluster_files = list.files(path = "data/BLUEPRINT/H3K27ac/SE_clusters/", full.names = TRUE, pattern = "_peaks_merged_H3K27ac_AllEnhancers_ENHANCER_TO_GENE.txt")
se_normal_files = k27ac_pdata[is.na(k27ac_pdata$Tissue), "se_files"]
layout(matrix(data = 1:5, nrow = 1, ncol = 5))
temp_var = lapply(X = seq_len(length(se_normal_files)), function(i){
  bn = gsub(pattern = "_blacklist_filtered_AllEnhancers_ENHANCER_TO_GENE.txt", replacement = "", x = basename(se_normal_files[i]))
  plot_SE(enhToGene = se_normal_files[i], sample = bn)
})

rm(temp_var)
layout(matrix(data = 1:3, nrow = 1, ncol = 3))
temp_var = lapply(X = seq_len(length(se_cluster_files)), function(i){
  bn = gsub(pattern = "_peaks_merged_H3K27ac_AllEnhancers_ENHANCER_TO_GENE.txt", replacement = "", x = basename(se_cluster_files[i]))
  plot_SE(enhToGene = se_cluster_files[i], sample = bn)
}) 

rm(temp_var)

Enhancer distribution per sample

se_ntbl = lapply(k27ac_pdata$se_files, function(x ){data.table::fread(x)[, .N, V14]})
names(se_ntbl) = k27ac_pdata$Sample_Name
se_ntbl = data.table::rbindlist(l = se_ntbl, idcol = "Sample_Name")
se_ntbl = merge(se_ntbl, k27ac_pdata[,c("Sample_Name", "Cluster")], by = "Sample_Name")
se_ntbl$Cell_Type = ifelse(is.na(as.character(se_ntbl$Cluster)), yes = unlist(data.table::tstrsplit(x = se_ntbl$Sample_Name, "_", keep = 2)), no = as.character(se_ntbl$Cluster))
se_ntbl$Condition = ifelse(is.na(as.character(se_ntbl$Cluster)), yes = "Normal", no = "Tumor")
colnames(se_ntbl)[2] = "isSuper"
se_ntbl$isSuper = ifelse(test = se_ntbl$isSuper == 1, yes = "SE", no = "TE")

#Sumary table for SE/TE numbers
se_ntbl_sup = data.table::dcast(data = se_ntbl, Sample_Name ~ isSuper, value.var = "N")
se_ntbl_sup = merge(se_ntbl_sup, se_ntbl[,.(Sample_Name, Cell_Type)])[!duplicated(Sample_Name)][order(Cell_Type)]
cell_type_cols = c(clust_cols[c("C2", "C4", "C5")], "CD34" = "#1B9E77", EC = "#D95F02", LC = "#7570B3", SP4 = "#66A61E", SP8 = "#E6AB02")

layout(mat = matrix(c(1, 2), ncol = 2, nrow = TRUE), widths = c(3, 1))

par(mar = c(3, 3, 1, 1))
b = boxplot(N ~ Cell_Type, data = se_ntbl[isSuper == "SE"], xaxt = "n", boxwex = 0.5, outline = TRUE, lty = 1.5, lwd = 1.8, outwex = 0, staplewex = 0, ylim = c(0, 1200), axes = FALSE, border = cell_type_cols)
abline(v = 1:9, h = pretty(c(0, 1200)), lty = 2, col = grid_cols)
axis(side = 1, at = 1:8, labels = names(cell_type_cols), lwd = 1.5, font = 1, las = 2, cex.axis = 0.6)
axis(side = 2, at = pretty(c(0, 1200)), las = 2, lwd = 1.5, font = 1, las = 2, cex.axis = 0.6)


par(mar = c(3, 0, 1, 1))
b = boxplot(N ~ Condition, data = se_ntbl[isSuper == "SE"], xaxt = "n", boxwex = 0.5, outline = TRUE, lty = 1.5, lwd = 1.8, outwex = 0, staplewex = 0, ylim = c(0, 1200), axes = FALSE, border = c("royalblue", "maroon"))
axis(side = 1, at = 1:2, labels = c("Thymic", "T-ALL"), lwd = 1.5, font = 1, las = 2, cex.axis = 0.6)
abline(v = 1:2, h = pretty(c(0, 1200)), lty = 2, col = grid_cols)

#axis(side = 2, at = pretty(c(0, 1200)), las = 2, lwd = 1.5, font = 1, las = 2, cex.axis = 1.8)

t.test(x = se_ntbl[isSuper == "SE"][Condition == 'Tumor', N], y = se_ntbl[isSuper == "SE"][Condition == 'Normal', N])
## 
##  Welch Two Sample t-test
## 
## data:  se_ntbl[isSuper == "SE"][Condition == "Tumor", N] and se_ntbl[isSuper == "SE"][Condition == "Normal", N]
## t = 3.4516, df = 14.951, p-value = 0.003575
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  114.1345 482.9655
## sample estimates:
## mean of x mean of y 
##    658.75    360.20

LOLA enrichment analysis

dbPath = file.path("data/LOLA/")
regionDB = LOLA::loadRegionDB(dbLocation = dbPath)
## Reading collection annotations:
##  In collection 'disease_enhancers', consider adding a 'collection.txt' annotation file.
##  In collection 'disease_promoters', consider adding a 'collection.txt' annotation file.
##  In collection 'Genomic', consider adding a 'collection.txt' annotation file.
##  In collection 'normal_enhancers', consider adding a 'collection.txt' annotation file.
##  In collection 'regulatory', consider adding a 'collection.txt' annotation file.
##  In collection 'repressors', consider adding a 'collection.txt' annotation file.
## Reading region annotations...
## ::Loading cache::    data/LOLA/disease_enhancers//disease_enhancers_files.RData
## ::Loading cache::    data/LOLA/disease_promoters//disease_promoters_files.RData
## ::Loading cache::    data/LOLA/Genomic//Genomic_files.RData
## ::Loading cache::    data/LOLA/normal_enhancers//normal_enhancers_files.RData
## ::Loading cache::    data/LOLA/regulatory//regulatory_files.RData
## ::Loading cache::    data/LOLA/repressors//repressors_files.RData
## disease_enhancers
## ::Loading cache::    data/LOLA/disease_enhancers/disease_enhancers.RData
## disease_promoters
## ::Loading cache::    data/LOLA/disease_promoters/disease_promoters.RData
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
##   - in 'x': chr17_gl000205_random, chrUn_gl000219, chrUn_gl000220, chr7_gl000195_random, chrUn_gl000218, chrUn_gl000225
##   - in 'y': chrY, chr17_gl000204_random
##   Make sure to always combine/compare objects based on the same reference
##   genome (use suppressWarnings() to suppress this warning).
## Genomic
## ::Loading cache::    data/LOLA/Genomic/Genomic.RData
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
##   - in 'x': chrUn_gl000225, chr17_gl000204_random
##   - in 'y': chrM, chr17_ctg5_hap1, chr19_gl000209_random, chr1_gl000191_random, chr4_ctg9_hap1, chr4_gl000193_random, chr4_gl000194_random, chr6_apd_hap1, chr6_cox_hap2, chr6_dbb_hap3, chr6_mann_hap4, chr6_mcf_hap5, chr6_qbl_hap6, chr6_ssto_hap7, chrUn_gl000211, chrUn_gl000212, chrUn_gl000213, chrUn_gl000215, chrUn_gl000222, chrUn_gl000223, chrUn_gl000224, chrUn_gl000227, chrUn_gl000228, chrUn_gl000241
##   Make sure to always combine/compare objects based on the same reference
##   genome (use suppressWarnings() to suppress this warning).
## normal_enhancers
## ::Loading cache::    data/LOLA/normal_enhancers/normal_enhancers.RData
## regulatory
## ::Loading cache::    data/LOLA/regulatory/regulatory.RData
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
##   - in 'x': chr17_gl000205_random, chrUn_gl000219, chrUn_gl000220, chr7_gl000195_random, chrUn_gl000218, chrUn_gl000225, chr17_ctg5_hap1, chr19_gl000209_random, chr1_gl000191_random, chr4_ctg9_hap1, chr4_gl000194_random, chr6_apd_hap1, chr6_cox_hap2, chr6_dbb_hap3, chr6_mann_hap4, chr6_mcf_hap5, chr6_qbl_hap6, chr6_ssto_hap7, chrUn_gl000211, chrUn_gl000213, chrUn_gl000215, chrUn_gl000222, chrUn_gl000223, chrUn_gl000224, chrUn_gl000227, chrUn_gl000228, chrUn_gl000241
##   - in 'y': chr11_gl000202_random
##   Make sure to always combine/compare objects based on the same reference
##   genome (use suppressWarnings() to suppress this warning).
## repressors
## ::Loading cache::    data/LOLA/repressors/repressors.RData
#All hyper DMPs as a background
bg_loci_all_dmps = data.table::rbindlist(l = dmps)[adj.P.Val < 0.05 & abs(logFC) > 0.2][!duplicated(rn)][,.(Chr, Start, end = End+1)]
bg_loci = GenomicRanges::makeGRangesFromDataFrame(df = bg_loci_all_dmps)
rm(bg_loci_all_dmps)

Run LOLA for all hyper DMPs

dmp_dat_gr = lapply(dmps, function(x) GenomicRanges::makeGRangesFromDataFrame(df = x[adj.P.Val < 0.05 & logFC > 0.2][,.(Chr, Start, end = End+1)]))

lola_res_hyper = LOLA::runLOLA(userSets = dmp_dat_gr, userUniverse = bg_loci, regionDB = regionDB, cores = 2)
## Calculating unit set overlaps...
## Calculating universe set overlaps...
## Calculating Fisher scores...
lola_res_hyper$pValueLog = ifelse(test = is.infinite(x = lola_res_hyper$pValueLog), yes = -log10(.Machine$double.xmin), no = lola_res_hyper$pValueLog)
lola_res_hyper$qValue = p.adjust(10^-lola_res_hyper$pValueLog, "fdr")
lola_res_hyper$significant = ifelse(test = lola_res_hyper$qValue < 0.01, yes = "Yes", no = "No")

#Plot LOLA results
lola_res_hyper$filename = factor(x = lola_res_hyper$filename, levels = unique(lola_res_hyper[order(collection, filename)][,filename]))

Run LOLA for all hypo DMPs

dmp_dat_gr = lapply(dmps, function(x) GenomicRanges::makeGRangesFromDataFrame(df = x[adj.P.Val < 0.05 & logFC < -0.2][,.(Chr, Start, end = End+1)]))

lola_res_hypo = LOLA::runLOLA(userSets = dmp_dat_gr, userUniverse = bg_loci, regionDB = regionDB, cores = 2)
## Calculating unit set overlaps...
## Calculating universe set overlaps...
## Calculating Fisher scores...
lola_res_hypo$pValueLog = ifelse(test = is.infinite(x = lola_res_hypo$pValueLog), yes = -log10(.Machine$double.xmin), no = lola_res_hypo$pValueLog)
lola_res_hypo$qValue = p.adjust(10^-lola_res_hypo$pValueLog, "fdr")
lola_res_hypo$significant = ifelse(test = lola_res_hypo$qValue < 0.01, yes = "Yes", no = "No")

lola_res_hypo$filename = factor(x = lola_res_hypo$filename, levels = unique(lola_res_hypo[order(collection, filename)][,filename]))
lola_res_tbl = data.table::rbindlist(list(Hyper = lola_res_hyper, Hypo = lola_res_hypo), idcol = "Status")
lola_res_tbl$wrap = ifelse(test = lola_res_tbl$filename %like% "SE", yes = "SE", no = ifelse(lola_res_tbl$filename %like% "TE", yes = "TE", no = lola_res_tbl$description))

lola_res_tbl[, filename := gsub(pattern = ".hg19|.bed", replacement = "", x = lola_res_tbl$filename)]
lola_res_tbl$filename = factor(x = lola_res_tbl$filename, levels = rev(c("Thymic_active_enhancers", "Thymic_primed_enhancers", "Thymic_putative_enhancers", "Thymic_active_promoters", "Thymus_K27Me3", "CD34_SE", "EC_SE", "LC_SE", "SP4_SE", "SP8_SE", "CD34_TE", "EC_TE", "LC_TE", "SP4_TE", "SP8_TE", "Promoters", "Intergenic", "TALL_K27Me3", "C2_consensus_K4me3", "C4_consensus_K4me3", "C5_consensus_K4me3", "C2_SE", "C4_SE", "C5_SE", "C2_TE", "C4_TE", "C5_TE")))

NOTE Below plot might look slightly different based on the LOLA version.

library(ggplot2)
ggplot(data = lola_res_tbl, aes(y = filename, x = userSet, fill = oddsRatio, color = significant))+coord_fixed()+geom_point(aes(size = pValueLog), pch =21)+theme_minimal(base_size = 10)+scale_fill_gradient(low = "gray", high = "black")+scale_color_manual(values = c(No = "darkgreen", Yes = "red"))+xlab("Cluster")+facet_wrap(~Status)+ylab(label = element_blank())

Homer motif results

Motif analysis was done using homer findMotifsGenome.pl for hypo DMPs (+/- 100bp) that are located within SE/TE regions from T-ALL. Results from homer - knownResults.txt are stored in data/homer_motifs directory.

#P-value threshold for homer motif results
pthr = 1e-5

kr_files = list.files(path = "data/homer_motifs/", pattern = "knownResults.txt", recursive = TRUE, full.names = TRUE)
homer_res_dat = lapply(kr_files, data.table::fread)
names(homer_res_dat) = paste0("C", 1:5)
homer_motifs = lapply(homer_res_dat, function(x) {
  m = x[`P-value` <= pthr]
  if (length(m) == 0) {
    return(NULL)
  }
  colnames(m) = c("Motif_Name", "Consensus", "P_value", "Log_P_value", "Q_value", "n_targets", "fract_targets", "n_bg_targets", "fract_bg_targets")
  m
})


homer_motifs = data.table::rbindlist(l = homer_motifs, idcol = "Cluster", fill = TRUE, use.names = TRUE)
homer_motifs$Motif = unlist( data.table::tstrsplit(x = homer_motifs$Motif_Name, split = "/", keep = 1))
homer_motifs$Motif_name = data.table::tstrsplit(x = homer_motifs$Motif, split = "\\(", keep = 1)
homer_motifs$Motif_family = data.table::tstrsplit(x = homer_motifs$Motif, split = "\\(", keep = 2)
homer_motifs$Motif_family = gsub(x = homer_motifs$Motif_family, pattern = "\\)$", replacement = "")

homer_motifs_ptbl = data.table::dcast(data = homer_motifs, Motif_Name ~ Cluster, value.var = "P_value")
writexl::write_xlsx(x = homer_motifs_ptbl, path = "Supplementary_table_S7.xlsx", col_names = TRUE)


#After discussing with Aurore following motifs are decided to be shown. Rest goto to supplemental table.
homer_motifs$Motif_name = toupper(x = homer_motifs$Motif_name) #Convert to upper cases (these are consevred across all vertebrates - double checked manually)
homer_motifs_tar = homer_motifs[Motif_name %in% c("CEBP", "CHOP", "HLF", "NFIL3", "ATF4", "AP-1", "PU.1", "ETS1", "FLI1", "NF1", "RUNX1", "MYB", "TAL1", "LEF1", "E2A", "HEB", "TCF4", "BORIS", "GATA1", "GATA2", "GATA3", "GATA4", "GATA6", "HOXD11", "HOXD13", "HOXB13", "HOXA13", "HOXA11")]
homer_motifs_tar[, id := paste0(Cluster, "_", Motif_name)]
homer_motifs_tar = homer_motifs_tar[!duplicated(id)]
homer_motifs_tar = data.table::dcast(data = homer_motifs_tar, Motif_name ~ Cluster, value.var = 'Log_P_value')
homer_motifs_tar_plot_dat = merge(homer_motifs_tar, homer_motifs[,.(Motif_name, Motif_family)], by = "Motif_name")
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[!duplicated(Motif_name)]
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[order(Motif_family, decreasing = TRUE)]
data.table::setDF(x = homer_motifs_tar_plot_dat, rownames = homer_motifs_tar_plot_dat$Motif_name)
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[,-1]
#Re-order
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[c("CEBP", "CHOP", "HLF", "NFIL3", "ATF4", "AP-1", "PU.1", "ETS1", "FLI1", "NF1", "RUNX1", "MYB", "TAL1", "LEF1", "E2A", "HEB", "TCF4", "BORIS", "GATA1", "GATA2", "GATA3", "GATA4", "GATA6", "HOXD11", "HOXD13", "HOXB13", "HOXA13", "HOXA11"),]
fam_order = homer_motifs_tar_plot_dat$Motif_family
homer_motifs_tar_plot_dat = homer_motifs_tar_plot_dat[,-6]
homer_motifs_tar_plot_dat[is.na(homer_motifs_tar_plot_dat)] = 0
homer_motifs_tar_plot_dat[homer_motifs_tar_plot_dat < 0] = 1

homer_motifs_tar_plot_dat = as.matrix(homer_motifs_tar_plot_dat)
lo = matrix(data = 1:3, nrow = 1, ncol = 3)
layout(mat = lo, widths = c(1, 1, 2))

par(mar = c(0, 0, 0, 0))
plot(NA, xlim = c(0, 1), ylim = c(1, nrow(homer_motifs_tar_plot_dat)), axes = FALSE, xlab = NA, ylab = NA)
text(x = 0.5, y = 1:nrow(homer_motifs_tar_plot_dat), labels = rownames(homer_motifs_tar_plot_dat), font = 3)

par(mar = c(0, 0, 0, 0))
plot(NA, xlim = c(0, 1), ylim = c(1, nrow(homer_motifs_tar_plot_dat)), axes = FALSE, xlab = NA, ylab = NA)
text(x = 0.5, y = 1:nrow(homer_motifs_tar_plot_dat), labels = fam_order, cex = 0.8)

par(mar = c(0, 0, 0, 0))
plot(NA, xlim = c(1, ncol(homer_motifs_tar_plot_dat)), ylim = c(1, nrow(homer_motifs_tar_plot_dat)), axes = FALSE, xlab = NA, ylab = NA)
for(i in seq_len(ncol(homer_motifs_tar_plot_dat))){
  x = homer_motifs_tar_plot_dat[,i]
  for(xi in seq_along(x)){
    if(x[xi] == 0){
      points(x = i, y = xi, pch = 21, col = "black")
    }else{
      points(x = i, y = xi, pch = 21, bg = "#02075d", col = 'black')
    }
  }
}

RNA-seq analysis

counts = read.delim(file = "data/BLUEPRINT_expression/count_matrix.tsv.gz", row.names = 1)
anno = read.delim("data/BLUEPRINT_expression/pheno_data.tsv.gz", row.names = 1)
counts = counts[,rownames(anno)]

if(!all(rownames(anno) == colnames(counts))){
  stop("Sample order do not match!")
}

#All measured genes
dim(counts)
## [1] 50572    48
#Restricting all ananlysis to protein coding genes
ens_genes = data.table::fread(input = "data/extdata/ens_gene_biotypes.tsv", header = FALSE)
ens_genes = ens_genes[V4 %in% "protein_coding"][V1 %in% 1:22]

counts = counts[rownames(counts) %in% ens_genes$V3,] #Protein coding genes

#All protein coding genes
dim(counts)
## [1] 16624    48
dds = DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = anno, design = ~Platform+Cluster)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq2::DESeq(object = dds, parallel = TRUE)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 8 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 8 workers
## -- replacing outliers and refitting for 351 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
vsd = DESeq2::vst(object = dds, blind = FALSE)
vsd_log_mat = order_by_sds(mat = as.data.frame(assay(vsd)))

batch correction on VSD (for visualization)

mod = model.matrix(~as.factor(Cluster), data=colData(vsd))
vsd_batch_corrected = sva::ComBat(dat = assay(vsd), batch = vsd$Platform, par.prior = TRUE, mod = mod)
## Found 2360 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found3batches
## Adjusting for5covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
#vsd_limma = limma::removeBatchEffect(x = assay(vsd), batch = vsd$Platform) #Using limma

vsd_batch_corrected = order_by_sds(as.data.frame(vsd_batch_corrected))
vsd_pca = prcomp(x = t(vsd_batch_corrected[1:1000,]))

vsd_pca_dat = as.data.frame(vsd_pca$x[,c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), drop= FALSE])
vsd_pca_var_explained = vsd_pca$sdev^2/sum(vsd_pca$sdev^2)
vsd_pca_dat = cbind(vsd_pca_dat, colData(vsd))
vsd_pca_dat$clust_color = clust_cols[vsd_pca_dat$Cluster]

#layout(mat = matrix(c(1, 2), nrow = 2, ncol = 1), heights = c(4, 2))
par(mar = c(3, 3, 1, 1))
plot(NA, axes = FALSE, xlab = NA, ylab = NA, cex = 1.2, xlim = c(-50, 50), ylim = range(pretty(vsd_pca_dat$PC2)))
abline(h = pretty(vsd_pca_dat$PC2), v = pretty(vsd_pca_dat$PC1), col = "gray90", lty = 2)
vsd_pca_dat$clust_color = ifelse(test = is.na(vsd_pca_dat$clust_color), yes = "gray70", no = vsd_pca_dat$clust_color)
points(x = vsd_pca_dat$PC1, y = vsd_pca_dat$PC2, col = "black", bg = vsd_pca_dat$clust_color, pch = 21, cex = 1)
axis(side = 1, at = seq(-50, 50, 25), cex.axis = 0.8)
axis(side = 2, at = pretty(vsd_pca_dat$PC2), las = 2, cex.axis = 0.8)
mtext(text = paste("PC2 ", round(vsd_pca_var_explained[2], digits = 2)), side = 2, line = 2, cex = 0.7)
mtext(text = paste("PC1 ", round(vsd_pca_var_explained[1], digits = 2)), side = 1, line = 2, cex = 0.7)
#plot.new()
#par(mar = c(0, 0, 0, 0))
legend(x = "topleft", legend = names(clust_cols), col = clust_cols, pch = 19, xpd = TRUE, ncol = 1, cex = 0.6)

coldata_tumor = anno[!anno$Cluster %in% "Normal",]
dim(coldata_tumor)
## [1] 44 58
hmanno = coldata_tumor[,c("TLX1", "TLX3", "TAL1", "HOXA_cis", "HOXA_trans", "HOXA_nd", "ETP_Phenotype", "DNMT3A", "IDH2", "Cohort", "Cluster")]

hmanno$TAL1 = as.character(hmanno$TAL1)
hmanno$TLX3 = as.character(hmanno$TLX3)
hmanno$TLX1 = as.character(hmanno$TLX1)
#hmanno$CALM_AF10 = as.character(hmanno$CALM_AF10)
hmanno$HOXA_cis = as.character(hmanno$HOXA_cis)
hmanno$HOXA_trans = as.character(hmanno$HOXA_trans)
hmanno$HOXA_nd = as.character(hmanno$HOXA_nd)
hmanno$ETP_Phenotype = ifelse(test = as.character(hmanno$ETP_Phenotype) == "Yes", yes = "1", no = "0")
hmanno$DNMT3A = ifelse(test = as.character(hmanno$DNMT3A) == "Mutant", yes = "1", no = "0")
hmanno$IDH2 = ifelse(test = as.character(hmanno$IDH2) == "Mutant", yes = "1", no = "0")

yesno_cols = c("1" = "maroon", "0" = "white")

#tall_genes = c("TLX1", "TLX3", "HOXA9", "LYL1", "LMO1", "LMO2", "TAL1", "TAL2", "NKX2-1", "HOXA11")
tall_genes = c("TLX1", "TLX3", "HOXA9", "TAL1")

tumsamps = rownames(coldata_tumor)
hmanno$Cohort = NULL
pheatmap::pheatmap(vsd_log_mat[tall_genes, tumsamps], annotation_col = hmanno, scale = 'row', show_colnames = FALSE, annotation_colors = list(pred = clust_cols, ETP_Phenotype = yesno_cols, TAL1 = yesno_cols, TLX3 = yesno_cols, TLX1 = yesno_cols, CALM_AF10 = yesno_cols, HOXA_cis = yesno_cols, HOXA_trans = yesno_cols, HOXA_nd = yesno_cols, DNMT3A = yesno_cols, IDH2 = yesno_cols, Cluster = clust_cols)) 

#

DEGs: Clusters v/s Thymus

clusters = paste0("C", 1:5)
res = lapply(clusters, function(r){
  rds = results(object = dds, contrast = c("Cluster", r, "Normal"), lfcThreshold = 1)
  #rds <- lfcShrink(dds, contrast = c("Cluster", r, "Th"), res = rds, type = "normal")
  rds = as.data.table((as.data.frame(rds)), keep.rownames = TRUE)
  rds[order(padj)]})

names(res) = clusters

sup8_data = lapply(res, function(x){
  x[padj < 0.1]
})

writexl::write_xlsx(x = sup8_data, path = "Supplementary_table_S8.xlsx", col_names = TRUE, format_headers = TRUE)
significantGenesUpTbl = sapply(res, function(x){
  table(cut(x[padj < 0.1][log2FoldChange > 0][,log2FoldChange], breaks = c(seq(0, 15, 3), 50)))
})

significantGenesDownTbl = sapply(res, function(x){
  table(cut(x[padj < 0.1][log2FoldChange < 0][,log2FoldChange], breaks = c(-50, seq(0, -15, -3))))
})
par(mar = c(3, 3, 1, 1))
b1 = barplot(height = significantGenesUpTbl, ylim = c(-1000, 1000),  col = brewer.pal(8,"Oranges"), names.arg = rep(NA, ncol(significantGenesUpTbl)), axes = FALSE, border = "#34495e")
b2 = barplot(height = -significantGenesDownTbl[6:1,], add = TRUE, col = brewer.pal(8,"Blues"), names.arg = rep(NA, ncol(significantGenesUpTbl)), axes = FALSE, border = "#34495e")

text(x = b1, y = colSums(x = significantGenesUpTbl)+50, labels = colSums(x = significantGenesUpTbl), cex = 0.7, xpd = TRUE)
text(x = b2, y = -(colSums(x = significantGenesDownTbl)+50), labels = colSums(x = significantGenesDownTbl), cex = 0.7)

mtext(text = colnames(significantGenesUpTbl), side = 1, at = b1, las = 1, cex = 0.7)
mtext(text = "DEGs", side = 2, line = 2, cex = 0.8)
axis(side = 2, at = seq(-1000, 1000, by = 250), las = 2, cex.axis = 0.7)

image(
    x = c(0.2, 0.3),
    y = seq(700, 1000, length.out = 4),
    z = matrix(1:4, nrow = 1),
    col = brewer.pal(4,"Oranges"), add = TRUE, xlim = c(1, 8), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA
)

text(x = 0.35, y = seq(700, 1000, length.out = 4), labels = rownames(significantGenesUpTbl), adj = 0, xpd = TRUE, cex = 0.6)


image(
    x = c(0.2, 0.3),
    y = seq(-1000, -700, length.out = 4),
    z = matrix(1:4, nrow = 1),
    col = rev(brewer.pal(4,"Blues")), add = TRUE, xlim = c(1, 8), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA
)

text(x = 0.35, y = seq(-1000, -700, length.out = 4), labels = rownames(significantGenesDownTbl), adj = 0, xpd = TRUE, cex = 0.6)

Correlate promoter DNA-methylation and Expression

epic_anno = data.table::copy(fData(object = meth_exprs))
data.table::setDT(epic_anno)
prom_probes = epic_anno[Annotation %in% 'promoter-TSS'][,.(rn, Hugo_Symbol)]

temp_meth = as.data.table(as.data.frame(exprs(meth_exprs)), keep.rownames = TRUE)
temp_meth = merge(prom_probes, temp_meth, by = 'rn')
temp_meth[,rn := NULL]
prom_meth = temp_meth[,lapply(.SD, mean, na.rm = TRUE), by = Hugo_Symbol]
data.table::setDF(x = prom_meth, rownames = prom_meth$Hugo_Symbol)
prom_meth$Hugo_Symbol = NULL
prom_meth = Biobase::ExpressionSet(
  assayData = as.matrix(prom_meth),
  phenoData = Biobase::AnnotatedDataFrame(pData(meth_exprs))
)

rm(temp_meth, prom_probes)

#Common genes covered by expression and methylation
com_genes = intersect(rownames(prom_meth), rownames(vsd_batch_corrected))
tum_samples = colnames(dds[,dds$Cluster != "Normal"])

Three analysis to be checked:

  1. Check promoter methylation levels of DEGs per cluster
temp_anno = as.data.frame(colData(vsd))

lo = layout(matrix(data = 1:10, nrow = 5, ncol = 2, byrow = TRUE))

temp = lapply(names(res), function(cl){
  de = res[[cl]]
  cl_samps = rownames(temp_anno[temp_anno$Cluster %in% cl, ])
  
  de_upg = intersect(de[padj < 0.1 & log2FoldChange > 0, rn], com_genes) #up-regulated DEGs
  de_dng = intersect(de[padj < 0.1 & log2FoldChange < 0, rn], com_genes)#down-regulated DEGs
  
  down_e = vsd_batch_corrected[de_dng,cl_samps]
  down_e = down_e[complete.cases(down_e),]
  up_e = vsd_batch_corrected[de_upg,cl_samps]
  up_e = up_e[complete.cases(up_e),]
  
  down_m = exprs(prom_meth[de_dng,cl_samps])
  down_m = down_m[complete.cases(down_m),]
  up_m = exprs(prom_meth[de_upg,cl_samps])
  up_m = up_m[complete.cases(up_m),]
  
  par(mar = c(2, 2, 1, 1))
  boxplot(apply(down_e, 1, mean), apply(up_e, 1, mean), ylim = c(4, 16), horizontal = TRUE, notch = TRUE, col = c("#16a085", "#c0392b"))
  pe = round(t.test(apply(down_e, 1, mean), apply(up_e, 1, mean), alternative = "l")$p.value, 2)
  if(pe < 0.001){
    text(x = 16, y = 1.5, labels = "***", srt = 90)  
  }else{
    text(x = 16, y = 1.5, labels = "ns", srt = 90)
  }

  par(mar = c(2, 2, 1, 1))
  boxplot(apply(down_m, 1, median), apply(up_m, 1, median), ylim = c(0, 1), horizontal = TRUE, notch = TRUE, col = c("#16a085", "#c0392b"))
  pm = round(t.test(apply(down_m, 1, median), apply(up_m, 1, median), alternative = "g")$p.value, 2)
  if(pm < 0.001){
    text(x = 1, y = 1.5, labels = "***", srt = 90)  
  }else{
    text(x = 1, y = 1.5, labels = "ns", srt = 90)
  }
})

  1. Correlate promoter methylation level with gene expression irrespective of cluster
m = as.data.frame(exprs(prom_meth[com_genes, tum_samples]))
e = vsd_batch_corrected[com_genes, tum_samples]

if(!all(colnames(m) == colnames(e))){
  stop("Sample names mismatch!")
}

if(!all(rownames(m) == rownames(e))){
  stop("Genes names mismatch!")
}

em_cor = lapply(seq_len(nrow(m)), function(i){
  cor.test(x = as.numeric(m[i,]), y = as.numeric(e[i,]), method = "spearman", alternative = "two.sided", exact = FALSE)
})
## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero

## Warning in cor(rank(x), rank(y)): the standard deviation is zero
names(em_cor) = rownames(m)

em_cor_summary = lapply(em_cor, function(x){
    data.table::data.table(p_val = unlist(x$p.value), estimate = unlist(x$estimate))
})

em_cor_summary = data.table::rbindlist(em_cor_summary, idcol = "Hugo_Symbol")
em_cor_summary = em_cor_summary[!is.na(estimate)]
em_cor_summary$p_adjusted = p.adjust(em_cor_summary$p_val, method = "hochberg")
em_cor_summary = em_cor_summary[order(p_adjusted)]

dim(em_cor_summary)
## [1] 15889     4
par(mar = c(4, 3, 2, 1))
hist(em_cor_summary$estimate, breaks = 30, col = "#7f8c8d", border = "white", xlim = c(-1, 1), main = NA, xlab = NA, ylab = NA)
hist(em_cor_summary[p_adjusted < 0.1, estimate], breaks = 50, border = "white", add = TRUE, col = "#c0392b")
legend(x = "topright", legend = c("FDR < 0.1"), col = "#c0392b", pch= 15, bty = "n")
mtext(text = "Spearman correlation", side = 1, line = 2.5)

#mtext(text = "Density", side = 2, line = 2)
#text(x = -0.8, y = 250, labels = paste0("N = ", nrow(em_cor_summary[p_adjusted < 0.1])))
  1. Correlate DMPs to nearby gene expression
genes2heihlight = list(C1 = c("AZU1", "CSF3", "EGFL7", "FES", "JDP2", "S100A6", "SLC2A5"), C2 = c("CD160", "CD47", "FOSL1"), C3 = c("MAP3K8", "MYEOV"), C4 = c("NPTX1"), C5 = c("CAPG", "EMP1", "FAM83A", "LGALS1", "NCR1", "RGS17", "TEC", "ALS2CL2", "AMPH", "CMTM8", "DEPDC7", "HOOK1", "MITF", "MPPED2", "PCDH9", "PKNOX2", "PARRES1", "RASEF", "RNF180", "S100A16", "SLFN5"))

par(mfrow = c(1, 5), mar = c(2, 2, 1, 1))
prom_meth_diff = lapply(names(dmps), function(x){
  #Mean lfc of all promoter associated probes
  prom_lfc = dmps[[x]][Annotation %in% "promoter-TSS"][,mean(logFC), .(Hugo_Symbol)]
  prom_lfc = merge(prom_lfc, res[[x]][,.(rn, log2FoldChange, padj)], by.x = 'Hugo_Symbol', by.y = 'rn') #Merge with correpsoiind expression LFC
  prom_lfc = prom_lfc[order(padj)]
  yat = seq(-30, 30, 10)#pretty(prom_lfc[,log2FoldChange])
  plot(NA, xlim = c(-1, 1), ylim = range(yat), axes = FALSE)
  abline(h = yat, v = seq(-1, 1, 0.25), lty = 2, col = "#ecf0f1")
  points(prom_lfc$V1, prom_lfc$log2FoldChange, pch = 19, col = '#ecf0f1', cex = 0.6)
  points(prom_lfc[padj < 0.1, V1], prom_lfc[padj < 0.1, log2FoldChange], pch = 19, col = '#95a5a6', cex = 0.6)
  axis(side = 1, at = seq(-1, 1, 0.25))
  axis(side = 2, at = yat)
  #legend(x = "topleft", pch = 19, col = "black", legend = paste("FDR < 0.1 [", nrow(prom_lfc[padj < 0.1]), "]"), bty = "n", xpd = TRUE)
  title(main = x, adj = 0)
  
  # qlen = quad_lens(prom_lfc[padj < 0.1, V1], prom_lfc[padj < 0.1, log2FoldChange])
  # #qpos = c("topright", "topleft", "bottomleft", "bottomright")
  # for(pos_idx in seq_len(length(qlen))){
  #   legend(x = names(qlen)[pos_idx], legend = paste0("N = ", qlen[[pos_idx]]), bty = "n")  
  # }
  colnames(prom_lfc) = c("Hugo_Symbol", "meth_diff", "exprs_lfc", "exprs_padj")
  prom_lfc$Quadrant = ifelse(
    test = prom_lfc$meth_diff > 0 &
      prom_lfc$exprs_lfc < 0,
    yes = "Q4",
    no = ifelse(
      test = prom_lfc$meth_diff < 0 &
        prom_lfc$exprs_lfc > 0,
      yes = "Q2",
      no = "Q1/Q3"
    )
  )
  
  prom_lfc$is_sig = ifelse(test = prom_lfc$exprs_padj < 0.1, yes = "yes", no = "no")
  
  x_mark_genes = prom_lfc[Hugo_Symbol %in% genes2heihlight[[x]]]
  points(x_mark_genes$meth_diff, x_mark_genes$exprs_lfc, pch = 19, col = 'maroon', cex = 0.6)
  #text(x = x_mark_genes$meth_diff, y = x_mark_genes$exprs_lfc, labels = x_mark_genes$Hugo_Symbol, adj = 1, cex = 0.6)
  
  if(nrow(x_mark_genes[Quadrant %in% "Q2"]) > 0){
    q2marks = x_mark_genes[Quadrant %in% "Q2"]
    text(x = -0.6, y = 30, labels = paste(q2marks$Hugo_Symbol, collapse = "\n"), adj = 0, cex = 0.7, xpd = TRUE)
  }
  
  if(nrow(x_mark_genes[Quadrant %in% "Q4"]) > 0){
    q4marks = x_mark_genes[Quadrant %in% "Q4"]
    text(x = 0.6, y = -20, labels = paste(q4marks$Hugo_Symbol, collapse = "\n"), adj = 1, cex = 0.7, xpd = TRUE)
  }
  
  prom_lfc
})

names(prom_meth_diff) = names(dmps)
#writexl::write_xlsx(x = prom_meth_diff, path = paste0(res_dir, "DMPs_prom_meth_exprs.xlsx"), col_names = T, format_headers = T)

Survival analysis

Based on clusters

surv_dat = pData(tumor_data)[,c("Sample_Name", "Overall_Survival", "Overall_Survival_Status", "Disease_Free_Survival", "Disease_Free_Survival_Status", "Cluster")]
#surv_dat$Disease_Free_Survival = surv_dat$Disease_Free_Survival/365
#surv_dat$Overall_Survival = surv_dat$Overall_Survival/365]
surv_cols = clust_cols[c("C1", "C2", "C3", "C4", "C5")]

#Overall survival
os_cluster = survival::survfit(survival::Surv(Overall_Survival, Overall_Survival_Status) ~ Cluster, surv_dat)
surv_diff = survival::survdiff(formula = survival::Surv(time = Overall_Survival, event = Overall_Survival_Status) ~ Cluster, data = surv_dat)
surv_diff_pval = signif(1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1), digits = 3)

#Disease free survival
dfs_cluster = survival::survfit(survival::Surv(Disease_Free_Survival, Disease_Free_Survival_Status) ~ Cluster, surv_dat)
df_surv_diff = survival::survdiff(formula = survival::Surv(time = Disease_Free_Survival, event = Disease_Free_Survival_Status) ~ Cluster, data = surv_dat)
df_surv_diff_pval = signif(1 - pchisq(df_surv_diff$chisq, length(df_surv_diff$n) - 1), digits = 3)

Survival plots in the manuscript are shown with risk table. However, here plots are shown without risk table.

par(mar = c(3, 4, 2, 1), mfrow = c(1, 2), cex = 0.7)
plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2000, 500), col = 'gray80', lty = 2)
lines(os_cluster, col = surv_cols, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
#text(x = 500, y = 0.2, labels = paste0("P = " , round(surv_diff_pval, digits = 3)))
legend(x = "bottomright", legend = names(surv_cols), col = surv_cols, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Overall Survival", adj = 0)
#plot(os_cluster, col = surv_cols, xlim = c(0, 2000), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)

plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2600, 500), col = 'gray80', lty = 2)
lines(dfs_cluster, col = surv_cols, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
#text(x = 500, y = 0.2, labels = paste0("P = " , round(df_surv_diff_pval, digits = 3)))
legend(x = "bottomright", legend = names(surv_cols), col = surv_cols, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Event Free Survival", adj = 0)

Merge clusters

  • C12 (hypomethylated): C1 and C2 samples (TAL1, DNMT3a/IDH2 mutants)
  • C34 (hyper-intermediate): C3, and C4 samples (TLX3 and TLX1)
  • C5 (hyper): C5 (HOXA9 trans/ETP)
surv_dat$Cluster = as.character(surv_dat$Cluster)
surv_dat$three_group = ifelse(test = surv_dat$Cluster %in% c("C1", "C2"), yes = "C12", no = ifelse(test = surv_dat$Cluster %in% c("C3", "C4"), yes = "C34", no = "C5"))
print(table(surv_dat$three_group))
## 
## C12 C34  C5 
##  48  63  32

Three cluster scenario

#Overall survival
os_cluster = survival::survfit(survival::Surv(Overall_Survival, Overall_Survival_Status) ~ three_group, surv_dat)
surv_diff = survival::survdiff(formula = survival::Surv(time = Overall_Survival, event = Overall_Survival_Status) ~ three_group, data = surv_dat)
surv_diff_pval = signif(1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1), digits = 3)

#Disease free survival
dfs_cluster = survival::survfit(survival::Surv(Disease_Free_Survival, Disease_Free_Survival_Status) ~ three_group, surv_dat)
df_surv_diff = survival::survdiff(formula = survival::Surv(time = Disease_Free_Survival, event = Disease_Free_Survival_Status) ~ three_group, data = surv_dat)
df_surv_diff_pval = signif(1 - pchisq(df_surv_diff$chisq, length(df_surv_diff$n) - 1), digits = 3)
clust3_col = c("#807DBA", "#238B45", "#F16913")
names(clust3_col) = c("C12", "C34", "C5")

par(mar = c(3, 4, 2, 1), mfrow = c(1, 2), cex = 0.7)
plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2600, 500), col = 'gray80', lty = 2)
lines(os_cluster, col = clust3_col, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
legend(x = "bottomright", legend = names(clust3_col), col = clust3_col, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Overall Survival", adj = 0)
#plot(os_cluster, col = surv_cols, xlim = c(0, 2000), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)

plot(NA, xlim = c(0, 2600), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA, lwd = 1.5)
abline(h = seq(0, 1, 0.2), v = seq(0, 2600, 500), col = 'gray80', lty = 2)
lines(dfs_cluster, col = clust3_col, lwd = 2)
axis(side = 1, at = seq(0, 2600, 500))
axis(side = 2, at = seq(0, 1, 0.2), las = 2)
legend(x = "bottomright", legend = names(clust3_col), col = clust3_col, lty = 1, lwd = 2, cex = 0.8)
mtext(text = "Time (Days)", side = 1, line = 2, cex = 0.8)
mtext(text = "Survival probability", side = 2.5, line = 2, cex = 0.8)
title(main = "Disease Free Survival", adj = 0)

Mice data

Clinical characteristics of xenographed samples

xeno = maftools::read.maf(maf = "data/targetted_NGS/xeno.maf", clinicalData = "data/targetted_NGS/xeno_anno.csv", verbose = FALSE, vc_nonSyn = c("Mutated", "Amplification", "Deletion"), removeDuplicatedVariants = FALSE)
maftools::oncoplot(maf = xeno, colors = vc_color_codes, clinicalFeatures = c("Cluster", "Maturation_Arrest", "ETP_Phenotype", "Oncogenic_TF"), drawColBar = FALSE, drawRowBar = FALSE, annotationFontSize = 0.8, showTumorSampleBarcodes = TRUE, anno_height = 1.75, legendFontSize = 1, showTitle = FALSE, fontSize = 0.5, SampleNamefontSize = 0.7)

Excel sheet data/mice_experiment.xlsx contains results from 5-Aza treatment on mice models including survival, blast counts, and tumor engraftment (measured as a function of bioluminiscence (photons per second)) over the time course.

Xenografts

#Color codes
bg_col = grDevices::adjustcolor(col = 'gray', alpha.f = 0.6)
col_codes = c('curatively' = '#D95F02', 'untreated' = '#1B9E77',  'preventively' = '#E7298A')
lo = layout(mat = matrix(c(1:10), nrow = 2, ncol = 5, byrow = FALSE))

mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 1)
title(main = "UPNT-M149 (C3)", adj = 0, cex.main = 0.8)
mtext(text = "Surv. probability", side = 2, cex = 0.8, line = 1.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 2)
mtext(text = "% BLASTS", side = 2, cex = 0.8, line = 1.8)

mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 3)
title(main = "UPNT-670 (C4)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 4)

mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 5)
title(main = "UPNT-529 (C5)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 6)

mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 7)
title(main = "UPNT-M525 (C2)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 8)

mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 9)
title(main = "UPNT-M894 (C2)", adj = 0, cex.main = 0.8)
plot_avg_blast(excel_sheet = "data/mice_experiment.xlsx", sheet_number = 10)

ALL-SIL cell line

par(mfrow = c(1, 2))
mice_surv(excel_sheet = "data/mice_experiment.xlsx", sheet_num = 11)
allsil_biolumi = readxl::read_xlsx(path = "data/mice_experiment.xlsx", sheet = 12)
data.table::setDT(x = allsil_biolumi)

allsil_biolumi$days = as.numeric(as.character(allsil_biolumi$days))
allsil_biolumi = data.table::melt(data = allsil_biolumi, id.vars = "days")
allsil_biolumi$value = as.numeric(as.character(allsil_biolumi$value))
allsil_biolumi$treatment = data.table::tstrsplit(x = allsil_biolumi$variable, split = " ", keep = 1)
allsil_biolumi = allsil_biolumi[!is.na(value)]
allsil_biolumi[,value := log10(value)]
allsil_biolumi_stat = allsil_biolumi[,.(mean = mean(value), sem = sd(value)/.N), .(treatment, days)]
allsil_biolumi_stat[, sem_up := mean + sem]
allsil_biolumi_stat[, sem_low := mean - sem]

xrange = pretty(range(allsil_biolumi_stat$days))
yrange = pretty(range(allsil_biolumi_stat$sem_up))

allsil_biolumi_stat = split(allsil_biolumi_stat, as.factor(allsil_biolumi_stat$treatment))

par(mar = c(3, 3, 1, 1))
plot(NA, NA, xlim = range(xrange), ylim = range(yrange), frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
abline(h = yrange, v = xrange, col = bg_col, lty = 2)
axis(side = 1, at = xrange, labels = xrange, lwd = 1, cex.axis = 0.8)
axis(side = 2, at = yrange, labels = yrange, las = 2, lwd = 1, cex.axis = 0.8)
mtext(text = "Days", side = 1, line = 2, font = 1, cex = 1)
mtext(text = "log10(photon/sec)", side = 2, line = 2, font = 1, cex = 0.8)

for(x in allsil_biolumi_stat){
  points(x = x$days, y = x$mean, col = col_codes[x$treatment], type = 'l', lwd = 1.8)
  segments(x0 = x$days, y0 = x$sem_low, x1 = x$days, y1 = x$sem_up, col = "black", lwd = 1.8)
  points(x = x$days, y = x$mean, col = 'black', pch = 21, bg = col_codes[x$treatment], cex = 1)
}

Session-info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Pop!_OS 18.04 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.12.0         AnnotationDbi_1.52.0       
##  [3] gtable_0.3.0                scales_1.1.1               
##  [5] peakSeason_0.1.0            ggplot2_3.3.3              
##  [7] RColorBrewer_1.1-2          ComplexHeatmap_2.6.2       
##  [9] circlize_0.4.12             pheatmap_1.0.12            
## [11] survival_3.2-7              umap_0.2.7.0               
## [13] fossil_0.4.0                shapefiles_0.7             
## [15] foreign_0.8-80              maps_3.3.0                 
## [17] sp_1.4-5                    NMF_0.23.0                 
## [19] cluster_2.1.0               rngtools_1.5               
## [21] pkgmaker_0.32.2             registry_0.5-1             
## [23] ape_5.4-1                   LOLA_1.20.0                
## [25] maftools_2.6.05             goseq_1.42.0               
## [27] geneLenDataBase_1.26.0      BiasedUrn_1.07             
## [29] sva_3.38.0                  BiocParallel_1.24.1        
## [31] genefilter_1.72.1           mgcv_1.8-33                
## [33] nlme_3.1-149                DESeq2_1.30.1              
## [35] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [37] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [39] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [41] IRanges_2.24.1              S4Vectors_0.28.1           
## [43] BiocGenerics_0.36.0         limma_3.46.0               
## [45] readxl_1.3.1                data.table_1.14.0          
## 
## loaded via a namespace (and not attached):
##   [1] BiocFileCache_1.14.0     plyr_1.8.6               splines_4.0.2           
##   [4] gridBase_0.4-7           digest_0.6.27            foreach_1.5.1           
##   [7] htmltools_0.5.1.1        GO.db_3.12.1             fansi_0.4.2             
##  [10] magrittr_2.0.1           memoise_2.0.0            doParallel_1.0.16       
##  [13] Biostrings_2.58.0        annotate_1.68.0          R.utils_2.10.1          
##  [16] askpass_1.1              prettyunits_1.1.1        colorspace_2.0-0        
##  [19] blob_1.2.1               rappdirs_0.3.3           xfun_0.21               
##  [22] dplyr_1.0.4              crayon_1.4.1             RCurl_1.98-1.2          
##  [25] jsonlite_1.7.2           iterators_1.0.13         glue_1.4.2              
##  [28] zlibbioc_1.36.0          XVector_0.30.0           GetoptLong_1.0.5        
##  [31] DelayedArray_0.16.2      shape_1.4.5              abind_1.4-5             
##  [34] DBI_1.1.1                edgeR_3.32.1             Rcpp_1.0.6              
##  [37] xtable_1.8-4             progress_1.2.2           clue_0.3-58             
##  [40] reticulate_1.18          bit_4.0.4                httr_1.4.2              
##  [43] ellipsis_0.3.1           farver_2.1.0             R.methodsS3_1.8.1       
##  [46] pkgconfig_2.0.3          XML_3.99-0.5             sass_0.3.1              
##  [49] dbplyr_2.1.0             simpleCache_0.4.1        locfit_1.5-9.4          
##  [52] utf8_1.1.4               labeling_0.4.2           tidyselect_1.1.0        
##  [55] rlang_0.4.10             reshape2_1.4.4           munsell_0.5.0           
##  [58] cellranger_1.1.0         tools_4.0.2              cachem_1.0.4            
##  [61] generics_0.1.0           RSQLite_2.2.3            evaluate_0.14           
##  [64] stringr_1.4.0            fastmap_1.1.0            yaml_2.2.1              
##  [67] knitr_1.31               bit64_4.0.5              beanplot_1.2            
##  [70] purrr_0.3.4              R.oo_1.24.0              xml2_1.3.2              
##  [73] biomaRt_2.46.3           compiler_4.0.2           curl_4.3                
##  [76] png_0.1-7                tibble_3.1.0             geneplotter_1.68.0      
##  [79] bslib_0.2.4              stringi_1.5.3            highr_0.8               
##  [82] GenomicFeatures_1.42.1   RSpectra_0.16-0          lattice_0.20-41         
##  [85] Matrix_1.3-2             vctrs_0.3.6              pillar_1.5.0            
##  [88] lifecycle_1.0.0          jquerylib_0.1.3          GlobalOptions_0.1.2     
##  [91] bitops_1.0-6             qvalue_2.22.0            rtracklayer_1.50.0      
##  [94] R6_2.5.0                 writexl_1.3.1            codetools_0.2-16        
##  [97] assertthat_0.2.1         rjson_0.2.20             openssl_1.4.3           
## [100] withr_2.4.1              GenomicAlignments_1.26.0 Rsamtools_2.6.0         
## [103] GenomeInfoDbData_1.2.4   berryFunctions_1.19.1    hms_1.0.0               
## [106] rmarkdown_2.7            Cairo_1.5-12.2